From e8378a929a924c4b218c86d3cfebdf204254f4f1 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Thu, 7 Jul 2022 12:45:08 -0500 Subject: [PATCH] Initial public release of rocHPL --- .clang-format | 90 ++ .gitignore | 47 + CMakeLists.txt | 217 ++++ LICENSE | 71 ++ README.md | 151 +++ cmake/Dependencies.cmake | 152 +++ include/hpl.hpp | 64 + include/hpl_auxil.hpp | 90 ++ include/hpl_blas.hpp | 266 ++++ include/hpl_comm.hpp | 94 ++ include/hpl_grid.hpp | 106 ++ include/hpl_misc.hpp | 66 + include/hpl_panel.hpp | 146 +++ include/hpl_pauxil.hpp | 287 +++++ include/hpl_pfact.hpp | 199 +++ include/hpl_pgesv.hpp | 150 +++ include/hpl_pmatgen.hpp | 73 ++ include/hpl_pmisc.hpp | 29 + include/hpl_ptest.hpp | 118 ++ include/hpl_ptimer.hpp | 71 ++ include/hpl_version.hpp.in | 24 + install.sh | 390 ++++++ scripts/HPL.dat | 31 + scripts/mpirun_rochpl.in | 198 +++ scripts/run_rochpl.in | 410 +++++++ src/HPL_InitGPU.cpp | 119 ++ src/HPL_pddriver.cpp | 285 +++++ src/HPL_pdinfo.cpp | 1557 ++++++++++++++++++++++++ src/HPL_pdtest.cpp | 501 ++++++++ src/auxil/HPL_abort.cpp | 74 ++ src/auxil/HPL_dlacpy.cpp | 68 ++ src/auxil/HPL_dlamch.cpp | 763 ++++++++++++ src/auxil/HPL_dlange.cpp | 132 ++ src/auxil/HPL_dlaprnt.cpp | 76 ++ src/auxil/HPL_dlatcpy.cpp | 68 ++ src/auxil/HPL_dlatcpy_device.cpp | 113 ++ src/auxil/HPL_fprintf.cpp | 53 + src/auxil/HPL_warn.cpp | 80 ++ src/blas/HPL_daxpy.cpp | 43 + src/blas/HPL_dgemm.cpp | 65 + src/blas/HPL_dgemv.cpp | 60 + src/blas/HPL_dger.cpp | 47 + src/blas/HPL_dscal.cpp | 41 + src/blas/HPL_idamax.cpp | 64 + src/comm/HPL_all_reduce.cpp | 59 + src/comm/HPL_all_reduce_dmxswp.cpp | 298 +++++ src/comm/HPL_allgatherv.cpp | 128 ++ src/comm/HPL_barrier.cpp | 40 + src/comm/HPL_bcast.cpp | 82 ++ src/comm/HPL_bcast_1rinM.cpp | 109 ++ src/comm/HPL_bcast_1ring.cpp | 99 ++ src/comm/HPL_bcast_2rinM.cpp | 165 +++ src/comm/HPL_bcast_2ring.cpp | 153 +++ src/comm/HPL_bcast_blonM.cpp | 185 +++ src/comm/HPL_bcast_blong.cpp | 161 +++ src/comm/HPL_broadcast.cpp | 58 + src/comm/HPL_recv.cpp | 63 + src/comm/HPL_reduce.cpp | 74 ++ src/comm/HPL_scatterv.cpp | 125 ++ src/comm/HPL_sdrv.cpp | 91 ++ src/comm/HPL_send.cpp | 60 + src/grid/HPL_grid_exit.cpp | 58 + src/grid/HPL_grid_info.cpp | 66 + src/grid/HPL_grid_init.cpp | 190 +++ src/matgen/HPL_pdmatgen.cpp | 262 ++++ src/matgen/HPL_pdrandmat_device.cpp | 201 +++ src/matgen/HPL_xjumpm.cpp | 92 ++ src/panel/HPL_pdpanel_SendToDevice.cpp | 216 ++++ src/panel/HPL_pdpanel_SendToHost.cpp | 28 + src/panel/HPL_pdpanel_bcast.cpp | 56 + src/panel/HPL_pdpanel_disp.cpp | 48 + src/panel/HPL_pdpanel_free.cpp | 56 + src/panel/HPL_pdpanel_init.cpp | 475 ++++++++ src/panel/HPL_pdpanel_new.cpp | 105 ++ src/panel/HPL_pdpanel_wait.cpp | 22 + src/pauxil/HPL_dlaswp00N_device.cpp | 111 ++ src/pauxil/HPL_dlaswp01T_device.cpp | 135 ++ src/pauxil/HPL_dlaswp02T_device.cpp | 106 ++ src/pauxil/HPL_dlaswp03T_device.cpp | 133 ++ src/pauxil/HPL_dlaswp04T_device.cpp | 128 ++ src/pauxil/HPL_dlaswp10N_device.cpp | 91 ++ src/pauxil/HPL_indxg2l.cpp | 96 ++ src/pauxil/HPL_indxg2lp.cpp | 116 ++ src/pauxil/HPL_indxg2p.cpp | 74 ++ src/pauxil/HPL_indxl2g.cpp | 105 ++ src/pauxil/HPL_infog2l.cpp | 280 +++++ src/pauxil/HPL_numroc.cpp | 67 + src/pauxil/HPL_numrocI.cpp | 185 +++ src/pauxil/HPL_pabort.cpp | 85 ++ src/pauxil/HPL_pdlamch.cpp | 87 ++ src/pauxil/HPL_pdlange_device.cpp | 302 +++++ src/pauxil/HPL_pwarn.cpp | 89 ++ src/pfact/HPL_dlocmax.cpp | 110 ++ src/pfact/HPL_dlocswpN.cpp | 150 +++ src/pfact/HPL_dlocswpT.cpp | 150 +++ src/pfact/HPL_pdfact.cpp | 109 ++ src/pfact/HPL_pdmxswp.cpp | 132 ++ 
src/pfact/HPL_pdpancrN.cpp | 233 ++++ src/pfact/HPL_pdpancrT.cpp | 232 ++++ src/pfact/HPL_pdpanllN.cpp | 224 ++++ src/pfact/HPL_pdpanllT.cpp | 223 ++++ src/pfact/HPL_pdpanrlN.cpp | 228 ++++ src/pfact/HPL_pdpanrlT.cpp | 224 ++++ src/pfact/HPL_pdrpancrN.cpp | 214 ++++ src/pfact/HPL_pdrpancrT.cpp | 213 ++++ src/pfact/HPL_pdrpanllN.cpp | 193 +++ src/pfact/HPL_pdrpanllT.cpp | 193 +++ src/pfact/HPL_pdrpanrlN.cpp | 198 +++ src/pfact/HPL_pdrpanrlT.cpp | 198 +++ src/pgesv/HPL_pdgesv.cpp | 409 +++++++ src/pgesv/HPL_pdlaswp.cpp | 532 ++++++++ src/pgesv/HPL_pdtrsv_device.cpp | 352 ++++++ src/pgesv/HPL_pdupdateNT.cpp | 169 +++ src/pgesv/HPL_pdupdateTT.cpp | 168 +++ src/pgesv/HPL_perm.cpp | 89 ++ src/pgesv/HPL_pipid.cpp | 164 +++ src/pgesv/HPL_piplen.cpp | 58 + src/pgesv/HPL_plindx.cpp | 238 ++++ src/timer/HPL_ptimer.cpp | 262 ++++ src/timer/HPL_ptimer_cputime.cpp | 45 + src/timer/HPL_ptimer_walltime.cpp | 29 + 121 files changed, 19503 insertions(+) create mode 100644 .clang-format create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 LICENSE create mode 100644 README.md create mode 100644 cmake/Dependencies.cmake create mode 100644 include/hpl.hpp create mode 100644 include/hpl_auxil.hpp create mode 100644 include/hpl_blas.hpp create mode 100644 include/hpl_comm.hpp create mode 100644 include/hpl_grid.hpp create mode 100644 include/hpl_misc.hpp create mode 100644 include/hpl_panel.hpp create mode 100644 include/hpl_pauxil.hpp create mode 100644 include/hpl_pfact.hpp create mode 100644 include/hpl_pgesv.hpp create mode 100644 include/hpl_pmatgen.hpp create mode 100644 include/hpl_pmisc.hpp create mode 100644 include/hpl_ptest.hpp create mode 100644 include/hpl_ptimer.hpp create mode 100644 include/hpl_version.hpp.in create mode 100755 install.sh create mode 100644 scripts/HPL.dat create mode 100755 scripts/mpirun_rochpl.in create mode 100755 scripts/run_rochpl.in create mode 100644 src/HPL_InitGPU.cpp create mode 100644 src/HPL_pddriver.cpp create mode 100644 src/HPL_pdinfo.cpp create mode 100644 src/HPL_pdtest.cpp create mode 100644 src/auxil/HPL_abort.cpp create mode 100644 src/auxil/HPL_dlacpy.cpp create mode 100644 src/auxil/HPL_dlamch.cpp create mode 100644 src/auxil/HPL_dlange.cpp create mode 100644 src/auxil/HPL_dlaprnt.cpp create mode 100644 src/auxil/HPL_dlatcpy.cpp create mode 100644 src/auxil/HPL_dlatcpy_device.cpp create mode 100644 src/auxil/HPL_fprintf.cpp create mode 100644 src/auxil/HPL_warn.cpp create mode 100644 src/blas/HPL_daxpy.cpp create mode 100644 src/blas/HPL_dgemm.cpp create mode 100644 src/blas/HPL_dgemv.cpp create mode 100644 src/blas/HPL_dger.cpp create mode 100644 src/blas/HPL_dscal.cpp create mode 100644 src/blas/HPL_idamax.cpp create mode 100644 src/comm/HPL_all_reduce.cpp create mode 100644 src/comm/HPL_all_reduce_dmxswp.cpp create mode 100644 src/comm/HPL_allgatherv.cpp create mode 100644 src/comm/HPL_barrier.cpp create mode 100644 src/comm/HPL_bcast.cpp create mode 100644 src/comm/HPL_bcast_1rinM.cpp create mode 100644 src/comm/HPL_bcast_1ring.cpp create mode 100644 src/comm/HPL_bcast_2rinM.cpp create mode 100644 src/comm/HPL_bcast_2ring.cpp create mode 100644 src/comm/HPL_bcast_blonM.cpp create mode 100644 src/comm/HPL_bcast_blong.cpp create mode 100644 src/comm/HPL_broadcast.cpp create mode 100644 src/comm/HPL_recv.cpp create mode 100644 src/comm/HPL_reduce.cpp create mode 100644 src/comm/HPL_scatterv.cpp create mode 100644 src/comm/HPL_sdrv.cpp create mode 100644 src/comm/HPL_send.cpp create mode 100644 src/grid/HPL_grid_exit.cpp create 
mode 100644 src/grid/HPL_grid_info.cpp create mode 100644 src/grid/HPL_grid_init.cpp create mode 100644 src/matgen/HPL_pdmatgen.cpp create mode 100644 src/matgen/HPL_pdrandmat_device.cpp create mode 100644 src/matgen/HPL_xjumpm.cpp create mode 100644 src/panel/HPL_pdpanel_SendToDevice.cpp create mode 100644 src/panel/HPL_pdpanel_SendToHost.cpp create mode 100644 src/panel/HPL_pdpanel_bcast.cpp create mode 100644 src/panel/HPL_pdpanel_disp.cpp create mode 100644 src/panel/HPL_pdpanel_free.cpp create mode 100644 src/panel/HPL_pdpanel_init.cpp create mode 100644 src/panel/HPL_pdpanel_new.cpp create mode 100644 src/panel/HPL_pdpanel_wait.cpp create mode 100644 src/pauxil/HPL_dlaswp00N_device.cpp create mode 100644 src/pauxil/HPL_dlaswp01T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp02T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp03T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp04T_device.cpp create mode 100644 src/pauxil/HPL_dlaswp10N_device.cpp create mode 100644 src/pauxil/HPL_indxg2l.cpp create mode 100644 src/pauxil/HPL_indxg2lp.cpp create mode 100644 src/pauxil/HPL_indxg2p.cpp create mode 100644 src/pauxil/HPL_indxl2g.cpp create mode 100644 src/pauxil/HPL_infog2l.cpp create mode 100644 src/pauxil/HPL_numroc.cpp create mode 100644 src/pauxil/HPL_numrocI.cpp create mode 100644 src/pauxil/HPL_pabort.cpp create mode 100644 src/pauxil/HPL_pdlamch.cpp create mode 100644 src/pauxil/HPL_pdlange_device.cpp create mode 100644 src/pauxil/HPL_pwarn.cpp create mode 100644 src/pfact/HPL_dlocmax.cpp create mode 100644 src/pfact/HPL_dlocswpN.cpp create mode 100644 src/pfact/HPL_dlocswpT.cpp create mode 100644 src/pfact/HPL_pdfact.cpp create mode 100644 src/pfact/HPL_pdmxswp.cpp create mode 100644 src/pfact/HPL_pdpancrN.cpp create mode 100644 src/pfact/HPL_pdpancrT.cpp create mode 100644 src/pfact/HPL_pdpanllN.cpp create mode 100644 src/pfact/HPL_pdpanllT.cpp create mode 100644 src/pfact/HPL_pdpanrlN.cpp create mode 100644 src/pfact/HPL_pdpanrlT.cpp create mode 100644 src/pfact/HPL_pdrpancrN.cpp create mode 100644 src/pfact/HPL_pdrpancrT.cpp create mode 100644 src/pfact/HPL_pdrpanllN.cpp create mode 100644 src/pfact/HPL_pdrpanllT.cpp create mode 100644 src/pfact/HPL_pdrpanrlN.cpp create mode 100644 src/pfact/HPL_pdrpanrlT.cpp create mode 100644 src/pgesv/HPL_pdgesv.cpp create mode 100644 src/pgesv/HPL_pdlaswp.cpp create mode 100644 src/pgesv/HPL_pdtrsv_device.cpp create mode 100644 src/pgesv/HPL_pdupdateNT.cpp create mode 100644 src/pgesv/HPL_pdupdateTT.cpp create mode 100644 src/pgesv/HPL_perm.cpp create mode 100644 src/pgesv/HPL_pipid.cpp create mode 100644 src/pgesv/HPL_piplen.cpp create mode 100644 src/pgesv/HPL_plindx.cpp create mode 100644 src/timer/HPL_ptimer.cpp create mode 100644 src/timer/HPL_ptimer_cputime.cpp create mode 100644 src/timer/HPL_ptimer_walltime.cpp diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..d659c15 --- /dev/null +++ b/.clang-format @@ -0,0 +1,90 @@ +--- +Language: Cpp +AccessModifierOffset: 0 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: true +AlignConsecutiveDeclarations: true +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortFunctionsOnASingleLine: true +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: 
false +AlwaysBreakTemplateDeclarations: true +BinPackArguments: false +BinPackParameters: false +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + - Regex: '^(<|"(gtest|isl|json)/)' + Priority: 3 + - Regex: '.*' + Priority: 1 +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 1000 +PointerAlignment: Left +ReflowComments: true +SortIncludes: false +SpaceAfterCStyleCast: false +# SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: Never +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 2 +UseTab: Never +... + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cdc7121 --- /dev/null +++ b/.gitignore @@ -0,0 +1,47 @@ +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app + +# vim tags +tags +.tags +.*.swp + +# Editors +.vscode + +# build-in-source directory +build + +# doc directory +docBin +_build + +#third-party software +tpl/ +ltmain.sh \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..645594b --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,217 @@ +# Modifications (c) 2018-2022 Advanced Micro Devices, Inc. +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. 
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+
+option(HPL_VERBOSE_PRINT "Enable printing to terminal during run" OFF)
+option(HPL_PROGRESS_REPORT "Enable printing progress report to terminal during run" OFF)
+option(HPL_DETAILED_TIMING "Enable detailed timers during run" OFF)
+
+option(ROCM_PATH "Path to ROCm install" /opt/rocm)
+option(HPL_BLAS_DIR "Path to CPU BLAS library" ${CMAKE_CURRENT_SOURCE_DIR}/tpl/openblas)
+option(HPL_MPI_DIR "Path to MPI library" ${CMAKE_CURRENT_SOURCE_DIR}/tpl/openmpi)
+
+option(HPL_OPENMPI_UCX "Compile WITH OpenMPI+UCX support." OFF)
+
+set(CMAKE_INSTALL_PREFIX "rocHPL" CACHE PATH "Install path prefix, prepended onto install directories")
+
+# CMake modules
+list(APPEND CMAKE_MODULE_PATH
+     ${CMAKE_CURRENT_SOURCE_DIR}/cmake
+     ${ROCM_PATH}/hip/cmake)
+
+# Set a default build type if none was specified
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "Setting build type to 'Release' as none was specified.")
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+# Honor per-config flags in try_compile() source-file signature.
cmake v3.7 and up +if(POLICY CMP0066) + cmake_policy(SET CMP0066 NEW) +endif() + +# rocHPL project +project(rochpl LANGUAGES CXX) + +# Build flags +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +# Build options +option(HPL_DEBUG "Compile with modest debugging turned on" OFF) +option(HPL_DETAILED_DEBUG "Compile with voluminous debugging information turned on" OFF) +option(HPL_DETAILED_TIMING "Enable detail timers" OFF) +option(HPL_REFERENCE "Build reference mode" OFF) +option(BUILD_TEST "Build rocHPL single-node test" OFF) + +# Dependencies +include(cmake/Dependencies.cmake) + +# Setup version +rocm_setup_version(VERSION 6.0.0) + +# This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on all the time +# This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# HPL sources +file(GLOB_RECURSE rochpl_source RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "src/*.cpp") + +# HPL device sources +file(GLOB_RECURSE rochpl_device_source RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "src/*_device.cpp") +list(REMOVE_ITEM rochpl_source ${rochpl_device_source}) + +# Flag source files as hip source files +foreach(i ${rochpl_device_source}) + set_source_files_properties(${i} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT TRUE) +endforeach() + +# HIP flags workaround while target_compile_options does not work +list(APPEND HIP_HIPCC_FLAGS "-Wno-unused-command-line-argument -fPIE") +list(APPEND CMAKE_HOST_FLAGS "") + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HIPCC_FLAGS "-g -ggdb") + list(APPEND CMAKE_HOST_FLAGS "-O0;-g") +else() + list(APPEND HIP_HIPCC_FLAGS "-O3 -march=native -ffp-contract=fast -ffast-math -funsafe-math-optimizations") + list(APPEND CMAKE_HOST_FLAGS "-O3;-march=native") +endif() + + +# Availability of rocm_check_target_ids command assures that we can also build +# for gfx90a target +if(COMMAND rocm_check_target_ids) + set(DEFAULT_AMDGPU_TARGETS "gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx908:xnack+;gfx90a:xnack-;gfx90a:xnack+") +else() + set(DEFAULT_AMDGPU_TARGETS "gfx900;gfx906;gfx908;gfx908") +endif() +set(TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") + +# AMD targets +foreach(target ${TARGETS}) + list(APPEND HIP_HIPCC_FLAGS "--amdgpu-target=${target}") +endforeach() + +# Target executable +hip_add_executable(rochpl ${rochpl_source} ${rochpl_device_source}) + +target_compile_options(rochpl PRIVATE ${CMAKE_HOST_FLAGS}) + +if(HPL_DEBUG) + target_compile_definitions(rochpl PRIVATE HPL_DEBUG) +endif() + +if(HPL_VERBOSE_PRINT) + target_compile_definitions(rochpl PRIVATE HPL_VERBOSE_PRINT) +endif() + +if(HPL_DETAILED_TIMING) + target_compile_definitions(rochpl PRIVATE HPL_DETAILED_TIMING) +endif() + +if(HPL_PROGRESS_REPORT) + target_compile_definitions(rochpl PRIVATE HPL_PROGRESS_REPORT) +endif() + +# Target include directories +target_include_directories(rochpl + PRIVATE + $ + $ + $ + $ + $) + +#HIP +target_link_libraries(rochpl PRIVATE hip::host) + +# MPI +target_link_libraries(rochpl PRIVATE MPI::MPI_CXX) + +# OpenMP +target_link_libraries(rochpl PRIVATE OpenMP::OpenMP_CXX) + +# Target link libraries +target_link_libraries(rochpl PRIVATE BLAS::BLAS) +target_link_libraries(rochpl PRIVATE roc::rocblas) +target_link_libraries(rochpl PRIVATE roc::roctracer) +target_link_libraries(rochpl PRIVATE roc::roctx) + +# Target properties +set_target_properties(rochpl PROPERTIES 
VERSION ${rochpl_VERSION}) +set_target_properties(rochpl PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + +set_target_properties(rochpl PROPERTIES LINKER_LANGUAGE CXX) + +target_link_options(rochpl PRIVATE "-fopenmp") + +set_target_properties(rochpl PROPERTIES HIP_ARCHITECTURES "${DEFAULT_AMDGPU_TARGETS}") + +# Configure a header file to pass the rocHPL version +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/include/hpl_version.hpp.in" + "${PROJECT_BINARY_DIR}/include/hpl_version.hpp") + +# Configure run scripts +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/scripts/run_rochpl.in" + "${CMAKE_BINARY_DIR}/run_rochpl" + @ONLY) +configure_file("${CMAKE_CURRENT_SOURCE_DIR}/scripts/mpirun_rochpl.in" + "${CMAKE_BINARY_DIR}/mpirun_rochpl" + @ONLY) + +#move input file +file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/scripts/HPL.dat + DESTINATION ${CMAKE_BINARY_DIR}) + +# Install targets +rocm_install_targets(TARGETS rochpl) + +install(PROGRAMS ${CMAKE_BINARY_DIR}/run_rochpl ${CMAKE_BINARY_DIR}/mpirun_rochpl + DESTINATION ${CMAKE_INSTALL_PREFIX}) +install(FILES ${CMAKE_BINARY_DIR}/HPL.dat + DESTINATION ${CMAKE_INSTALL_PREFIX}) + +# Package specific CPACK vars +set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 3.5.0)") +set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 3.5.0") +set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") + +if(NOT CPACK_PACKAGING_INSTALL_PREFIX) + set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") +endif() + +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" "\${CPACK_PACKAGING_INSTALL_PREFIX}/include") + +# Package name +set(package_name rochpl) + +rocm_create_package( + NAME ${package_name} + DESCRIPTION "Radeon Open Compute HPL application" + MAINTAINER "Noel Chalmers") diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f603960 --- /dev/null +++ b/LICENSE @@ -0,0 +1,71 @@ +====================================================================== + -- High Performance Computing Linpack Benchmark (HPL) + HPL - 2.2 - February 24, 2016 + Antoine P. Petitet + University of Tennessee, Knoxville + Innovative Computing Laboratory + (C) Copyright 2000-2008 All Rights Reserved + + -- Copyright notice and Licensing terms: + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. All advertising materials mentioning features or use of this + software must display the following acknowledgement: + This product includes software developed at the University of + Tennessee, Knoxville, Innovative Computing Laboratory. + + 4. The name of the University, the name of the Laboratory, or the + names of its contributors may not be used to endorse or promote + products derived from this software without specific written + permission. + + -- Disclaimer: + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +====================================================================== + +Modifications (c) 2018-2022 Advanced Micro Devices, Inc. +Modified by: Noel Chalmers + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..bad2d62 --- /dev/null +++ b/README.md @@ -0,0 +1,151 @@ +# rocHPL +rocHPL is a benchmark based on the [HPL][] benchmark application, implemented on top of AMD's Radeon Open Compute [ROCm][] Platform, runtime, and toolchains. rocHPL is created using the [HIP][] programming language and optimized for AMD's latest discrete GPUs. 
+
+## Requirements
+* Git
+* CMake (3.10 or later)
+* MPI (Optional)
+* AMD [ROCm] platform (3.5 or later)
+* [rocBLAS][]
+
+## Quickstart rocHPL build and install
+
+#### Install script
+You can build rocHPL using the `install.sh` script:
+```
+# Clone rocHPL using git
+git clone https://github.com/ROCmSoftwarePlatform/rocHPL.git
+
+# Go to rocHPL directory
+cd rocHPL
+
+# Run install.sh script
+# Command line options:
+# -h|--help         - prints this help message
+# -g|--debug        - Set build type to Debug (otherwise build Release)
+# --prefix=         - Path to rocHPL install location (Default: build/rocHPL)
+# --with-rocm=      - Path to ROCm install (Default: /opt/rocm)
+# --with-rocblas=   - Path to rocBLAS library (Default: /opt/rocm/rocblas)
+# --with-cpublas=   - Path to external CPU BLAS library (Default: clone+build AMD BLIS)
+# --with-mpi=       - Path to external MPI install (Default: clone+build OpenMPI)
+# --verbose-print   - Verbose output during HPL setup (Default: true)
+# --progress-report - Print progress report to terminal during HPL run (Default: true)
+# --detailed-timing - Record detailed timers during HPL run (Default: true)
+./install.sh
+```
+By default, [BLIS] v3.1, [UCX] v1.12.1, and [OpenMPI] v4.1.4 will be cloned and built in rocHPL/tpl. After building, the `rochpl` executable is placed in build/rochpl-install.
+
+## Running rocHPL benchmark application
+rocHPL provides some helpful wrapper scripts. A wrapper script for launching via `mpirun` is provided in `mpirun_rochpl`. This script has two distinct run modes. The first takes the problem size and grid parameters directly on the command line:
+```
+mpirun_rochpl -P <P> -Q <Q> -N <N> --NB <NB> -f <frac>
+# where
+# P    - is the number of rows in the MPI grid
+# Q    - is the number of columns in the MPI grid
+# N    - is the total number of rows/columns of the global matrix
+# NB   - is the panel size in the blocking algorithm
+# frac - is the split-update fraction (important for hiding some MPI communication)
+```
+This run script will launch a total of np=PxQ MPI processes.
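+
+For example, a single-node run on four GPUs arranged in a 2 x 2 grid (using the 4 GPU problem size from the Performance evaluation section below; the split-update fraction of 0.6 is only an illustrative value, not a tuned recommendation):
+```
+mpirun_rochpl -P 2 -Q 2 -N 126976 --NB 512 -f 0.6
+```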

+
+The second run mode takes an input file together with a number of MPI processes:
+```
+mpirun_rochpl -P <P> -Q <Q> -i <input> -f <frac>
+# where
+# P     - is the number of rows in the MPI grid
+# Q     - is the number of columns in the MPI grid
+# input - is the input filename (default HPL.dat)
+# frac  - is the split-update fraction (important for hiding some MPI communication)
+```
+
+The input file accepted by the `rochpl` executable follows the format below:
+```
+HPLinpack benchmark input file
+Innovative Computing Laboratory, University of Tennessee
+HPL.out      output file name (if any)
+0            device out (6=stdout,7=stderr,file)
+1            # of problems sizes (N)
+45312        Ns
+1            # of NBs
+384          NBs
+1            PMAP process mapping (0=Row-,1=Column-major)
+1            # of process grids (P x Q)
+1            Ps
+1            Qs
+16.0         threshold
+1            # of panel fact
+2            PFACTs (0=left, 1=Crout, 2=Right)
+1            # of recursive stopping criterium
+2            NBMINs (>= 1)
+1            # of panels in recursion
+2            NDIVs
+1            # of recursive panel fact.
+2            RFACTs (0=left, 1=Crout, 2=Right)
+1            # of broadcast
+6            BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM,6=Ibcast)
+1            # of lookahead depth
+1            DEPTHs (>=0)
+1            SWAP (0=bin-exch,1=long,2=mix)
+64           swapping threshold
+1            L1 in (0=transposed,1=no-transposed) form
+0            U in (0=transposed,1=no-transposed) form
+0            Equilibration (0=no,1=yes)
+8            memory alignment in double (> 0)
+```
+
+The `mpirun_rochpl` script wraps a second script, `run_rochpl`, wherein some CPU core bindings are determined automatically based on the node-local MPI grid. Users wishing to launch rocHPL via a workload manager such as slurm may directly use this run script. For example,
+```
+srun -N 2 -n 16 run_rochpl -P 4 -Q 4 -N 128000 --NB 512
+```
+When launching to multiple compute nodes, it can be useful to specify the local MPI grid layout on each node. To specify this, the `-p` and `-q` input parameters are used. For example, the srun line above launches to two compute nodes, each with 8 GPUs. The local MPI grid layout can be specified as either:
+```
+srun -N 2 -n 16 run_rochpl -P 4 -Q 4 -p 2 -q 4 -N 128000 --NB 512
+```
+or
+```
+srun -N 2 -n 16 run_rochpl -P 4 -Q 4 -p 4 -q 2 -N 128000 --NB 512
+```
+This helps to control where, and how much, inter-node communication is occurring.
+
+## Performance evaluation
+rocHPL is typically weak scaled so that the global matrix fills all available VRAM on all GPUs. The matrix size N is usually selected to be a multiple of the blocksize NB. Some sample runs on 32GB MI100 GPUs include:
+* 1 MI100: `mpirun_rochpl -P 1 -Q 1 -N 64512 --NB 512`
+* 2 MI100: `mpirun_rochpl -P 1 -Q 2 -N 90112 --NB 512`
+* 4 MI100: `mpirun_rochpl -P 2 -Q 2 -N 126976 --NB 512`
+* 8 MI100: `mpirun_rochpl -P 2 -Q 4 -N 180224 --NB 512`
+
+Overall performance of the benchmark is measured in 64-bit floating point operations (FLOPs) per second. Performance is reported at the end of the run to the user's specified output (by default, performance is printed to stdout and to a results file, HPL.out).
+
+See [the Wiki](../../wiki/Common-rocHPL-run-configurations) for some common run configurations for various AMD Instinct GPUs.
+
+## Testing rocHPL
+At the end of each benchmark run, a residual error check is computed, and PASS or FAIL is printed to output.
+
+The simplest suite of tests should run configurations from 1 to 4 GPUs to exercise different communication code paths. For example, the tests:
+```
+mpirun_rochpl -P 1 -Q 1 -N 45312
+mpirun_rochpl -P 1 -Q 2 -N 45312
+mpirun_rochpl -P 2 -Q 1 -N 45312
+mpirun_rochpl -P 2 -Q 2 -N 45312
+```
+should all report PASSED.
+
+Please note that for successful testing, a device with at least 16GB of device memory is required.
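+
+These four configurations can also be scripted. The loop below is a minimal sketch: it assumes the `mpirun_rochpl` wrapper sits in the current directory and that the verdict line of the output contains the word PASSED or FAILED.
+```
+#!/bin/bash
+# Run the 1-4 GPU test grids from above and print only the residual-check verdict
+for PQ in "1 1" "1 2" "2 1" "2 2"; do
+  set -- $PQ   # split "P Q" into positional parameters $1 and $2
+  echo "Testing P=$1 Q=$2"
+  ./mpirun_rochpl -P $1 -Q $2 -N 45312 | grep -E "PASSED|FAILED"
+done
+```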
+
+## Support
+Please use [the issue tracker][] for bugs and feature requests.
+
+## License
+The [license file][] can be found in the main repository.
+
+[HPL]: http://icl.utk.edu/hpl/
+[ROCm]: https://github.com/RadeonOpenCompute/ROCm
+[HIP]: https://github.com/ROCm-Developer-Tools/HIP
+[rocBLAS]: https://github.com/ROCmSoftwarePlatform/rocBLAS
+[BLIS]: https://github.com/amd/blis
+[OpenMPI]: https://github.com/open-mpi/ompi
+[UCX]: https://github.com/openucx/ucx
+[the issue tracker]: https://github.com/ROCmSoftwarePlatform/rocHPL/issues
+[license file]: https://github.com/ROCmSoftwarePlatform/rocHPL
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
new file mode 100644
index 0000000..7cbc012
--- /dev/null
+++ b/cmake/Dependencies.cmake
@@ -0,0 +1,152 @@
+# Modifications (c) 2019-2022 Advanced Micro Devices, Inc.
+#
+# Redistribution and use in source and binary forms, with or without modification,
+# are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+# 3. Neither the name of the copyright holder nor the names of its contributors
+# may be used to endorse or promote products derived from this software without
+# specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+# IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+# Dependencies
+
+# Git
+find_package(Git REQUIRED)
+
+# Look for a BLAS lib
+# For some reason cmake doesn't let us manually specify a search path in FindBLAS,
+# so let's add our own libraries
+get_filename_component(HPL_BLAS_DIR ${HPL_BLAS_DIR} ABSOLUTE)
+
+# Look for BLIS in the provided path
+find_library(BLAS_LIBRARIES NAMES blis
+             PATHS ${HPL_BLAS_DIR}
+             NO_DEFAULT_PATH)
+
+if (NOT BLAS_LIBRARIES)
+  # If we don't find BLIS, look for openblas
+  find_library(BLAS_LIBRARIES NAMES openblas
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+endif()
+if (NOT BLAS_LIBRARIES)
+  # If we don't find BLIS or openBLAS, look for MKL
+  find_library(BLAS_LIBRARIES NAMES mkl_core
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+  find_library(BLAS_SEQ_LIBRARIES NAMES mkl_sequential
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+  find_library(BLAS_LP64_LIBRARIES NAMES mkl_intel_lp64
+               PATHS ${HPL_BLAS_DIR}
+               NO_DEFAULT_PATH)
+endif()
+
+if (BLAS_LIBRARIES)
+  message(STATUS "Found BLAS: ${BLAS_LIBRARIES}")
+else()
+  # If we still haven't found a blas library, maybe cmake will?
+ find_package(BLAS REQUIRED) +endif() +add_library(BLAS::BLAS IMPORTED INTERFACE) +set_property(TARGET BLAS::BLAS PROPERTY INTERFACE_LINK_LIBRARIES "${BLAS_LP64_LIBRARIES};${BLAS_SEQ_LIBRARIES};${BLAS_LIBRARIES}") + +# Find OpenMP package +find_package(OpenMP) +if (NOT OPENMP_FOUND) + message("-- OpenMP not found. Compiling WITHOUT OpenMP support.") +else() + option(HPL_OPENMP "Compile WITH OpenMP support." ON) +endif() + +# MPI +set(MPI_HOME ${HPL_MPI_DIR}) +find_package(MPI REQUIRED) + +# Add some paths +list(APPEND CMAKE_PREFIX_PATH ${ROCBLAS_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}) + +find_library(ROCTRACER NAMES roctracer64 + PATHS ${ROCM_PATH}/lib + NO_DEFAULT_PATH) +find_library(ROCTX NAMES roctx64 + PATHS ${ROCM_PATH}/lib + NO_DEFAULT_PATH) + +message("-- roctracer: ${ROCTRACER}") +message("-- roctx: ${ROCTX}") + +add_library(roc::roctracer SHARED IMPORTED) +set_target_properties(roc::roctracer PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" + INTERFACE_LINK_LIBRARIES "hip::host" + IMPORTED_LOCATION "${ROCTRACER}" + IMPORTED_SONAME "libroctracer.so") +add_library(roc::roctx SHARED IMPORTED) +set_target_properties(roc::roctx PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${ROCM_PATH}/include" + INTERFACE_LINK_LIBRARIES "hip::host" + IMPORTED_LOCATION "${ROCTX}" + IMPORTED_SONAME "libroctx64.so") + +# Find HIP package +find_package(HIP REQUIRED) + +# rocblas +find_package(rocblas REQUIRED) + +get_target_property(rocblas_LIBRARIES roc::rocblas IMPORTED_LOCATION_RELEASE) + +message("-- rocBLAS version: ${rocblas_VERSION}") +message("-- rocBLAS include dirs: ${rocblas_INCLUDE_DIRS}") +message("-- rocBLAS libraries: ${rocblas_LIBRARIES}") + +get_filename_component(ROCBLAS_LIB_PATH ${rocblas_LIBRARIES} DIRECTORY) + +# ROCm cmake package +find_package(ROCM QUIET CONFIG PATHS ${CMAKE_PREFIX_PATH}) +if(NOT ROCM_FOUND) + set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern) + set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") + file(DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip + ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log) + + list(GET status 0 status_code) + list(GET status 1 status_string) + + if(NOT status_code EQUAL 0) + message(FATAL_ERROR "error: downloading + 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed + status_code: ${status_code} + status_string: ${status_string} + log: ${log} + ") + endif() + + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip + WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) + + find_package(ROCM REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}) +endif() + +include(ROCMSetupVersion) +include(ROCMCreatePackage) +include(ROCMInstallTargets) +include(ROCMPackageConfigHelpers) +include(ROCMInstallSymlinks) +include(ROCMCheckTargetIds OPTIONAL) diff --git a/include/hpl.hpp b/include/hpl.hpp new file mode 100644 index 0000000..4c99804 --- /dev/null +++ b/include/hpl.hpp @@ -0,0 +1,64 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+#ifndef HPL_HPP
+#define HPL_HPP
+/*
+ * ---------------------------------------------------------------------
+ * HPL default compile options that can be overridden in the cmake build
+ * ---------------------------------------------------------------------
+ */
+#ifndef HPL_DETAILED_TIMING /* Do not enable detailed timings */
+#define HPL_NO_DETAILED_TIMING
+#endif
+
+#undef HPL_USE_COLLECTIVES
+//#define HPL_USE_COLLECTIVES
+
+/*
+ * ---------------------------------------------------------------------
+ * Include files
+ * ---------------------------------------------------------------------
+ */
+#include
+
+// NC: hipcc in ROCm 3.7 complains if __HIP_PLATFORM_HCC__ is defined in the
+// compile line
+#ifdef __HIPCC__
+#ifdef __HIP_PLATFORM_HCC__
+#undef __HIP_PLATFORM_HCC__
+#endif
+#endif
+#include "hip/hip_runtime_api.h"
+
+#include "hpl_version.hpp"
+#include "hpl_misc.hpp"
+#include "hpl_blas.hpp"
+#include "hpl_auxil.hpp"
+
+#include "hpl_pmisc.hpp"
+#include "hpl_pauxil.hpp"
+#include "hpl_panel.hpp"
+#include "hpl_pfact.hpp"
+#include "hpl_pgesv.hpp"
+
+#include "hpl_ptimer.hpp"
+#include "hpl_pmatgen.hpp"
+#include "hpl_ptest.hpp"
+
+#endif
+/*
+ * End of hpl.hpp
+ */
diff --git a/include/hpl_auxil.hpp b/include/hpl_auxil.hpp
new file mode 100644
index 0000000..537a3d0
--- /dev/null
+++ b/include/hpl_auxil.hpp
@@ -0,0 +1,90 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_AUXIL_HPP +#define HPL_AUXIL_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +/* + * --------------------------------------------------------------------- + * typedef definitions + * --------------------------------------------------------------------- + */ +typedef enum { + HPL_NORM_A = 800, + HPL_NORM_1 = 801, + HPL_NORM_I = 802 +} HPL_T_NORM; + +typedef enum { + HPL_MACH_EPS = 900, /* relative machine precision */ + HPL_MACH_SFMIN = 901, /* safe minimum st 1/sfmin does not overflow */ + HPL_MACH_BASE = 902, /* base = base of the machine */ + HPL_MACH_PREC = 903, /* prec = eps*base */ + HPL_MACH_MLEN = 904, /* number of (base) digits in the mantissa */ + HPL_MACH_RND = 905, /* 1.0 if rounding occurs in addition */ + HPL_MACH_EMIN = 906, /* min exponent before (gradual) underflow */ + HPL_MACH_RMIN = 907, /* underflow threshold base**(emin-1) */ + HPL_MACH_EMAX = 908, /* largest exponent before overflow */ + HPL_MACH_RMAX = 909 /* overflow threshold - (base**emax)*(1-eps) */ + +} HPL_T_MACH; +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_fprintf(FILE*, const char*, ...); +void HPL_warn(FILE*, int, const char*, const char*, ...); +void HPL_abort(int, const char*, const char*, ...); + +void HPL_dlacpy(const int, + const int, + const double*, + const int, + double*, + const int); + +void HPL_dlatcpy(const int, + const int, + const double*, + const int, + double*, + const int); + +void HPL_dlatcpy_gpu(const int, + const int, + const double*, + const int, + double*, + const int); + +double HPL_dlange(const HPL_T_NORM, + const int, + const int, + const double*, + const int); + +double HPL_dlamch(const HPL_T_MACH); + +#endif +/* + * End of hpl_auxil.hpp + */ diff --git a/include/hpl_blas.hpp b/include/hpl_blas.hpp new file mode 100644 index 0000000..56dce6c --- /dev/null +++ b/include/hpl_blas.hpp @@ -0,0 +1,266 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_BLAS_HPP +#define HPL_BLAS_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ + +#include "hpl_misc.hpp" +#include +#include +#include + +extern rocblas_handle handle; +extern hipStream_t computeStream; +extern hipStream_t dataStream; + +#if __cplusplus +extern "C" { +#endif + +/* + * --------------------------------------------------------------------- + * typedef definitions + * --------------------------------------------------------------------- + */ +enum HPL_ORDER { HplRowMajor = 101, HplColumnMajor = 102 }; +enum HPL_TRANS { HplNoTrans = 111, HplTrans = 112, HplConjTrans = 113 }; +enum HPL_UPLO { HplUpper = 121, HplLower = 122 }; +enum HPL_DIAG { HplNonUnit = 131, HplUnit = 132 }; +enum HPL_SIDE { HplLeft = 141, HplRight = 142 }; + +/* + * --------------------------------------------------------------------- + * Blocked OpenMP routines + * --------------------------------------------------------------------- + */ + +void HPL_idamax_omp(const int N, + const double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size, + int* max_index, + double* max_value); + +void HPL_dscal_omp(const int N, + const double ALPHA, + double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_daxpy_omp(const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_dger_omp(const enum HPL_ORDER ORDER, + const int M, + const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + double* A, + const int LDA, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_dgemv_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANS, + const int M, + const int N, + const double ALPHA, + const double* A, + const int LDA, + const double* X, + const int INCX, + const double BETA, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +void HPL_dgemm_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANSA, + const enum HPL_TRANS TRANSB, + const int M, + const int N, + const int K, + const double ALPHA, + const double* A, + const int LDA, + const double* B, + const int LDB, + const double BETA, + double* C, + const int LDC, + const int NB, + const int II, + const int thread_rank, + const int thread_size); + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define CBLAS_INDEX int + +#define CBLAS_ORDER HPL_ORDER +#define CblasRowMajor HplRowMajor +#define CblasColMajor HplColMajor + +#define CBLAS_TRANSPOSE HPL_TRANS +#define CblasNoTrans HplNoTrans +#define CblasTrans HplTrans +#define CblasConjTrans HplConjTrans + +#define CBLAS_UPLO HPL_UPLO +#define CblasUpper HplUpper +#define CblasLower HplLower + +#define CBLAS_DIAG HPL_DIAG +#define CblasNonUnit HplNonUnit +#define CblasUnit HplUnit + +#define CBLAS_SIDE HPL_SIDE +#define CblasLeft HplLeft +#define CblasRight HplRight +/* + * 
--------------------------------------------------------------------- + * CBLAS Function prototypes + * --------------------------------------------------------------------- + */ +CBLAS_INDEX cblas_idamax(const int, const double*, const int); +void cblas_dswap(const int, double*, const int, double*, const int); +void cblas_dcopy(const int, const double*, const int, double*, const int); + +void cblas_daxpy(const int, + const double, + const double*, + const int, + double*, + const int); + +void cblas_dscal(const int, const double, double*, const int); + +void cblas_dgemv(const enum CBLAS_ORDER, + const enum CBLAS_TRANSPOSE, + const int, + const int, + const double, + const double*, + const int, + const double*, + const int, + const double, + double*, + const int); + +void cblas_dger(const enum CBLAS_ORDER, + const int, + const int, + const double, + const double*, + const int, + const double*, + const int, + double*, + const int); + +void cblas_dtrsv(const enum CBLAS_ORDER, + const enum CBLAS_UPLO, + const enum CBLAS_TRANSPOSE, + const enum CBLAS_DIAG, + const int, + const double*, + const int, + double*, + const int); + +void cblas_dgemm(const enum CBLAS_ORDER, + const enum CBLAS_TRANSPOSE, + const enum CBLAS_TRANSPOSE, + const int, + const int, + const int, + const double, + const double*, + const int, + const double*, + const int, + const double, + double*, + const int); + +void cblas_dtrsm(const enum CBLAS_ORDER, + const enum CBLAS_SIDE, + const enum CBLAS_UPLO, + const enum CBLAS_TRANSPOSE, + const enum CBLAS_DIAG, + const int, + const int, + const double, + const double*, + const int, + double*, + const int); +/* + * --------------------------------------------------------------------- + * HPL C BLAS macro definition + * --------------------------------------------------------------------- + */ +#define HPL_dswap cblas_dswap +#define HPL_dcopy cblas_dcopy +#define HPL_daxpy cblas_daxpy +#define HPL_dscal cblas_dscal +#define HPL_idamax cblas_idamax + +#define HPL_dgemv cblas_dgemv +#define HPL_dtrsv cblas_dtrsv +#define HPL_dger cblas_dger + +#define HPL_dgemm cblas_dgemm +#define HPL_dtrsm cblas_dtrsm + +#if __cplusplus +} +#endif + +#endif +/* + * hpl_blas.hpp + */ diff --git a/include/hpl_comm.hpp b/include/hpl_comm.hpp new file mode 100644 index 0000000..f69c44e --- /dev/null +++ b/include/hpl_comm.hpp @@ -0,0 +1,94 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_COMM_HPP +#define HPL_COMM_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" +#include "hpl_panel.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef enum { + HPL_1RING = 401, /* Unidirectional ring */ + HPL_1RING_M = 402, /* Unidirectional ring (modified) */ + HPL_2RING = 403, /* Bidirectional ring */ + HPL_2RING_M = 404, /* Bidirectional ring (modified) */ + HPL_BLONG = 405, /* long broadcast */ + HPL_BLONG_M = 406, /* long broadcast (modified) */ +} HPL_T_TOP; + +typedef MPI_Op HPL_T_OP; + +#define HPL_SUM MPI_SUM +#define HPL_MAX MPI_MAX +#define HPL_MIN MPI_MIN + +extern MPI_Op HPL_DMXSWP; +extern MPI_Datatype PDFACT_ROW; +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_FAILURE 0 +#define HPL_SUCCESS 1 +/* + * --------------------------------------------------------------------- + * comm function prototypes + * --------------------------------------------------------------------- + */ +int HPL_send(double*, int, int, int, MPI_Comm); +int HPL_recv(double*, int, int, int, MPI_Comm); +int HPL_sdrv(double*, int, int, double*, int, int, int, MPI_Comm); +int HPL_bcast(double*, int, int, MPI_Comm, HPL_T_TOP top); +int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM); +int HPL_scatterv(double*, const int*, const int*, const int, int, MPI_Comm); +int HPL_allgatherv(double*, const int, const int*, const int*, MPI_Comm); +int HPL_barrier(MPI_Comm); +int HPL_broadcast(void*, const int, const HPL_T_TYPE, const int, MPI_Comm); + +int HPL_reduce(void*, + const int, + const HPL_T_TYPE, + const HPL_T_OP, + const int, + MPI_Comm); + +int HPL_all_reduce(void*, + const int, + const HPL_T_TYPE, + const HPL_T_OP, + MPI_Comm); + +void HPL_dmxswp(void*, void*, int*, MPI_Datatype*); +void HPL_all_reduce_dmxswp(double*, const int, const int, MPI_Comm, double*); + +#endif +/* + * End of hpl_comm.hpp + */ diff --git a/include/hpl_grid.hpp b/include/hpl_grid.hpp new file mode 100644 index 0000000..9952df7 --- /dev/null +++ b/include/hpl_grid.hpp @@ -0,0 +1,106 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_GRID_H +#define HPL_GRID_H +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef enum { HPL_INT = 100, HPL_DOUBLE = 101 } HPL_T_TYPE; + +typedef enum { HPL_ROW_MAJOR = 201, HPL_COLUMN_MAJOR = 202 } HPL_T_ORDER; + +typedef struct HPL_S_grid { + MPI_Comm all_comm; /* grid communicator */ + MPI_Comm row_comm; /* row communicator */ + MPI_Comm col_comm; /* column communicator */ + HPL_T_ORDER order; /* ordering of the procs in the grid */ + int iam; /* my rank in the grid */ + int myrow; /* my row number in the grid */ + int mycol; /* my column number in the grid */ + int nprow; /* the total # of rows in the grid */ + int npcol; /* the total # of columns in the grid */ + int local_myrow; /* my row number in the node-local grid */ + int local_mycol; /* my column number in the node-local grid */ + int local_nprow; /* the total # of rows in the node-local grid */ + int local_npcol; /* the total # of columns in the node-local grid */ + int nprocs; /* the total # of procs in the grid */ + int row_ip2; /* largest power of two <= nprow */ + int row_hdim; /* row_ip2 procs hypercube dimension */ + int row_ip2m1; /* largest power of two <= nprow-1 */ + int row_mask; /* row_ip2m1 procs hypercube mask */ + int col_ip2; /* largest power of two <= npcol */ + int col_hdim; /* col_ip2 procs hypercube dimension */ + int col_ip2m1; /* largest power of two <= npcol-1 */ + int col_mask; /* col_ip2m1 procs hypercube mask */ +} HPL_T_grid; + +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +#define HPL_2_MPI_TYPE(typ) ((typ == HPL_INT ? MPI_INT : MPI_DOUBLE)) +/* + * The following macros perform common modulo operations; All functions + * except MPosMod assume arguments are < d (i.e., arguments are themsel- + * ves within modulo range). + */ +/* increment with mod */ +#define MModInc(I, d) \ + if(++(I) == (d)) (I) = 0 +/* decrement with mod */ +#define MModDec(I, d) \ + if(--(I) == -1) (I) = (d)-1 +/* positive modulo */ +#define MPosMod(I, d) ((I) - ((I) / (d)) * (d)) +/* add two numbers */ +#define MModAdd(I1, I2, d) \ + (((I1) + (I2) < (d)) ? (I1) + (I2) : (I1) + (I2) - (d)) +/* add 1 to # */ +#define MModAdd1(I, d) (((I) != (d)-1) ? (I) + 1 : 0) +/* subtract two numbers */ +#define MModSub(I1, I2, d) (((I1) < (I2)) ? (d) + (I1) - (I2) : (I1) - (I2)) +/* sub 1 from # */ +#define MModSub1(I, d) (((I) != 0) ? 
(I)-1 : (d)-1) +/* + * --------------------------------------------------------------------- + * grid function prototypes + * --------------------------------------------------------------------- + */ +int HPL_grid_init(MPI_Comm, + const HPL_T_ORDER, + const int, + const int, + const int, + const int, + HPL_T_grid*); + +int HPL_grid_exit(HPL_T_grid*); +int HPL_grid_info(const HPL_T_grid*, int*, int*, int*, int*); + +#endif +/* + * End of hpl_grid.hpp + */ diff --git a/include/hpl_misc.hpp b/include/hpl_misc.hpp new file mode 100644 index 0000000..75bd326 --- /dev/null +++ b/include/hpl_misc.hpp @@ -0,0 +1,66 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_MISC_HPP +#define HPL_MISC_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_rone 1.0 +#define HPL_rtwo 2.0 +#define HPL_rzero 0.0 +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +#define Mabs(a_) (((a_) < 0) ? -(a_) : (a_)) +#define Mmin(a_, b_) (((a_) < (b_)) ? (a_) : (b_)) +#define Mmax(a_, b_) (((a_) > (b_)) ? (a_) : (b_)) + +#define Mfloor(a, b) (((a) > 0) ? (((a) / (b))) : (-(((-(a)) + (b)-1) / (b)))) +#define Mceil(a, b) (((a) + (b)-1) / (b)) +#define Miceil(a, b) (((a) > 0) ? ((((a) + (b)-1) / (b))) : (-((-(a)) / (b)))) + +#define Mupcase(C) (((C) > 96 && (C) < 123) ? (C)&0xDF : (C)) +#define Mlowcase(C) (((C) > 64 && (C) < 91) ? (C) | 32 : (C)) +/* + * Mptr returns a pointer to a_( i_, j_ ) for readability reasons and + * also less silly errors ... + */ +#define Mptr(a_, i_, j_, lda_) \ + ((a_) + (size_t)(i_) + (size_t)(j_) * (size_t)(lda_)) +/* + * Align pointer + */ +#define HPL_PTR(ptr_, al_) ((((size_t)(ptr_) + (al_)-1) / (al_)) * (al_)) +#endif +/* + * End of hpl_misc.hpp + */ diff --git a/include/hpl_panel.hpp b/include/hpl_panel.hpp new file mode 100644 index 0000000..a8e5f44 --- /dev/null +++ b/include/hpl_panel.hpp @@ -0,0 +1,146 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PANEL_HPP +#define HPL_PANEL_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" +#include "hpl_grid.hpp" + +/* + * --------------------------------------------------------------------- + * Data Structures + * --------------------------------------------------------------------- + */ +typedef struct HPL_S_panel { + struct HPL_S_grid* grid; /* ptr to the process grid */ + struct HPL_S_palg* algo; /* ptr to the algo parameters */ + struct HPL_S_pmat* pmat; /* ptr to the local array info */ + double* A; /* ptr to trailing part of A */ + double* dA; /* ptr to trailing part of A */ + double* LWORK; /* L work space */ + double* dLWORK; /* L device-copy work space */ + double* UWORK; /* U work space */ + double* dUWORK; /* U device-copy work space */ + double* fWORK; /* pdfact work space */ + double* L2; /* ptr to L */ + double* L1; /* ptr to jb x jb upper block of A */ + double* dL2; /* ptr to L */ + double* dL1; /* ptr to jb x jb upper block of A */ + double* DINFO; /* ptr to replicated scalar info */ + double* dDINFO; /* ptr to replicated scalar info */ + int* ipiv; + int* dipiv; + int* lindxA; + int* dlindxA; + int* lindxAU; + int* dlindxAU; + int* lindxU; + int* dlindxU; + int* permU; + int* dpermU; + double* U; /* ptr to U */ + double* dU; /* ptr to U */ + double* W; /* ptr to W */ + double* dW; /* ptr to W */ + double* U1; /* ptr to U1 */ + double* dU1; /* ptr to U1 */ + double* W1; /* ptr to W1 */ + double* dW1; /* ptr to W1 */ + double* U2; /* ptr to U2 */ + double* dU2; /* ptr to U2 */ + double* W2; /* ptr to W2 */ + double* dW2; /* ptr to W2 */ + int nu0; + int nu1; + int nu2; + int ldu0; + int ldu1; + int ldu2; + int* IWORK; /* integer workspace for swapping */ + void* buffers[2]; /* buffers for panel bcast */ + int counts[2]; /* counts for panel bcast */ + MPI_Datatype dtypes[2]; /* data types for panel bcast */ + MPI_Request request[1]; /* requests for panel bcast */ + MPI_Status status[1]; /* status for panel bcast */ + int nb; /* distribution blocking factor */ + int jb; /* panel width */ + int m; /* global # of rows of trailing part of A */ + int n; /* global # of cols of trailing part of A */ + int ia; /* global row index of trailing part of A */ + int ja; /* global col index of trailing part of A */ + int mp; /* local # of rows of trailing part of A */ + int nq; /* local # of cols of trailing part of A */ + int ii; /* local row index of trailing part of A */ + int jj; /* local col index of trailing part of A */ + int lda; /* local leading dim of array A */ + int dlda; /* local leading dim of array A */ + int prow; /* proc. row owning 1st row of trail. A */ + int pcol; /* proc. col owning 1st col of trail. 
A */ + int msgid; /* message id for panel bcast */ + int ldl2; /* local leading dim of array L2 */ + int dldl2; /* local leading dim of array L2 */ + int len; /* length of the buffer to broadcast */ + unsigned int max_pinned_work_size; /* largest size of pinned A space */ + unsigned int max_lwork_size; /* largest size of WORK space */ + unsigned int max_uwork_size; /* largest size of WORK space */ + unsigned int max_iwork_size; /* largest size of IWORK space */ + unsigned int max_fwork_size; /* largest size of fWORK space */ + unsigned int free_work_now; /* should we deallocate */ +} HPL_T_panel; + +/* + * --------------------------------------------------------------------- + * panel function prototypes + * --------------------------------------------------------------------- + */ +#include "hpl_pgesv.hpp" + +void HPL_pdpanel_new(HPL_T_grid*, + HPL_T_palg*, + const int, + const int, + const int, + HPL_T_pmat*, + const int, + const int, + const int, + HPL_T_panel**); + +void HPL_pdpanel_init(HPL_T_grid*, + HPL_T_palg*, + const int, + const int, + const int, + HPL_T_pmat*, + const int, + const int, + const int, + HPL_T_panel*); + +int HPL_pdpanel_disp(HPL_T_panel**); +int HPL_pdpanel_free(HPL_T_panel*); +void HPL_pdpanel_SendToHost(HPL_T_panel*); +void HPL_pdpanel_SendToDevice(HPL_T_panel*); +void HPL_pdpanel_Wait(HPL_T_panel* PANEL); +int HPL_pdpanel_bcast(HPL_T_panel*); +#endif +/* + * End of hpl_panel.hpp + */ diff --git a/include/hpl_pauxil.hpp b/include/hpl_pauxil.hpp new file mode 100644 index 0000000..7730000 --- /dev/null +++ b/include/hpl_pauxil.hpp @@ -0,0 +1,287 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PAUXIL_HPP +#define HPL_PAUXIL_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +#include "hpl_auxil.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_grid.hpp" + +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +/* + * Mindxg2p returns the process coodinate owning the entry globally in- + * dexed by ig_. + */ +#define Mindxg2p(ig_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((ig_) >= (inb_)) && ((src_) >= 0) && ((nprocs_) > 1)) { \ + proc_ = (src_) + 1 + ((ig_) - (inb_)) / (nb_); \ + proc_ -= (proc_ / (nprocs_)) * (nprocs_); \ + } else { \ + proc_ = (src_); \ + } \ + } + +#define Mindxg2l(il_, ig_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((ig_) < (inb_)) || ((src_) == -1) || ((nprocs_) == 1)) { \ + il_ = (ig_); \ + } else { \ + int i__, j__; \ + j__ = (i__ = ((ig_) - (inb_)) / (nb_)) / (nprocs_); \ + il_ = (nb_) * (j__ - i__) + \ + ((i__ + 1 - (j__ + 1) * (nprocs_)) ? 
(ig_) - (inb_) : (ig_)); \ + } \ + } + +#define Mindxg2lp(il_, proc_, ig_, inb_, nb_, src_, nprocs_) \ + { \ + if(((ig_) < (inb_)) || ((src_) == -1) || ((nprocs_) == 1)) { \ + il_ = (ig_); \ + proc_ = (src_); \ + } else { \ + int i__, j__; \ + j__ = (i__ = ((ig_) - (inb_)) / (nb_)) / (nprocs_); \ + il_ = (nb_) * (j__ - i__) + \ + ((i__ + 1 - (j__ + 1) * (nprocs_)) ? (ig_) - (inb_) : (ig_)); \ + proc_ = (src_) + 1 + i__; \ + proc_ -= (proc_ / (nprocs_)) * (nprocs_); \ + } \ + } +/* + * Mindxl2g computes the global index ig_ corresponding to the local + * index il_ in process proc_. + */ +#define Mindxl2g(ig_, il_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((src_) >= 0) && ((nprocs_) > 1)) { \ + if((proc_) == (src_)) { \ + if((il_) < (inb_)) \ + ig_ = (il_); \ + else \ + ig_ = \ + (il_) + (nb_) * ((nprocs_)-1) * (((il_) - (inb_)) / (nb_) + 1); \ + } else if((proc_) < (src_)) { \ + ig_ = (il_) + (inb_) + \ + (nb_) * (((nprocs_)-1) * ((il_) / (nb_)) + (proc_) - (src_)-1 + \ + (nprocs_)); \ + } else { \ + ig_ = (il_) + (inb_) + \ + (nb_) * (((nprocs_)-1) * ((il_) / (nb_)) + (proc_) - (src_)-1); \ + } \ + } else { \ + ig_ = (il_); \ + } \ + } +/* + * MnumrocI computes the # of local indexes np_ residing in the process + * of coordinate proc_ corresponding to the interval of global indexes + * i_:i_+n_-1 assuming that the global index 0 resides in the process + * src_, and that the indexes are distributed from src_ using the para- + * meters inb_, nb_ and nprocs_. + */ +#define MnumrocI(np_, n_, i_, inb_, nb_, proc_, src_, nprocs_) \ + { \ + if(((src_) >= 0) && ((nprocs_) > 1)) { \ + int inb__, mydist__, n__, nblk__, quot__, src__; \ + if((inb__ = (inb_) - (i_)) <= 0) { \ + nblk__ = (-inb__) / (nb_) + 1; \ + src__ = (src_) + nblk__; \ + src__ -= (src__ / (nprocs_)) * (nprocs_); \ + inb__ += nblk__ * (nb_); \ + if((n__ = (n_)-inb__) <= 0) { \ + if((proc_) == src__) \ + np_ = (n_); \ + else \ + np_ = 0; \ + } else { \ + if((mydist__ = (proc_)-src__) < 0) mydist__ += (nprocs_); \ + nblk__ = n__ / (nb_) + 1; \ + mydist__ -= nblk__ - (quot__ = (nblk__ / (nprocs_))) * (nprocs_); \ + if(mydist__ < 0) { \ + if((proc_) != src__) \ + np_ = (nb_) + (nb_)*quot__; \ + else \ + np_ = inb__ + (nb_)*quot__; \ + } else if(mydist__ > 0) { \ + np_ = (nb_)*quot__; \ + } else { \ + if((proc_) != src__) \ + np_ = n__ + (nb_) + (nb_) * (quot__ - nblk__); \ + else \ + np_ = (n_) + (nb_) * (quot__ - nblk__); \ + } \ + } \ + } else { \ + if((n__ = (n_)-inb__) <= 0) { \ + if((proc_) == (src_)) \ + np_ = (n_); \ + else \ + np_ = 0; \ + } else { \ + if((mydist__ = (proc_) - (src_)) < 0) mydist__ += (nprocs_); \ + nblk__ = n__ / (nb_) + 1; \ + mydist__ -= nblk__ - (quot__ = (nblk__ / (nprocs_))) * (nprocs_); \ + if(mydist__ < 0) { \ + if((proc_) != (src_)) \ + np_ = (nb_) + (nb_)*quot__; \ + else \ + np_ = inb__ + (nb_)*quot__; \ + } else if(mydist__ > 0) { \ + np_ = (nb_)*quot__; \ + } else { \ + if((proc_) != (src_)) \ + np_ = n__ + (nb_) + (nb_) * (quot__ - nblk__); \ + else \ + np_ = (n_) + (nb_) * (quot__ - nblk__); \ + } \ + } \ + } \ + } else { \ + np_ = (n_); \ + } \ + } + +#define Mnumroc(np_, n_, inb_, nb_, proc_, src_, nprocs_) \ + MnumrocI(np_, n_, 0, inb_, nb_, proc_, src_, nprocs_) +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_indxg2lp(int*, + int*, + const int, + const int, + const int, + const int, + const int); + +int HPL_indxg2l(const int, const int, const int, 
const int, const int); +int HPL_indxg2p(const int, const int, const int, const int, const int); + +int HPL_indxl2g(const int, + const int, + const int, + const int, + const int, + const int); + +void HPL_infog2l(int, + int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + const int, + int*, + int*, + int*, + int*); + +int HPL_numroc(const int, + const int, + const int, + const int, + const int, + const int); + +int HPL_numrocI(const int, + const int, + const int, + const int, + const int, + const int, + const int); + +void HPL_dlaswp00N(const int, const int, double*, const int, const int*); + +void HPL_dlaswp01T(const int, + const int, + double*, + const int, + double*, + const int, + const int*); + +void HPL_dlaswp02T(const int, + const int, + double*, + const int, + const int*, + const int*); + +void HPL_dlaswp03T(const int, + const int, + double*, + const int, + double*, + const int, + const int*); + +void HPL_dlaswp04T(const int, + const int, + double*, + const int, + double*, + const int, + const int*); + +void HPL_dlaswp10N(const int, const int, double*, const int, const int*); + +void HPL_pabort(int, const char*, const char*, ...); +void HPL_pwarn(FILE*, int, const char*, const char*, ...); + +void HPL_pdlaprnt(const HPL_T_grid*, + const int, + const int, + const int, + double*, + const int, + const int, + const int, + const char*); + +double HPL_pdlamch(MPI_Comm, const HPL_T_MACH); + +double HPL_pdlange(const HPL_T_grid*, + const HPL_T_NORM, + const int, + const int, + const int, + const double*, + const int); + +#endif +/* + * End of hpl_pauxil.hpp + */ diff --git a/include/hpl_pfact.hpp b/include/hpl_pfact.hpp new file mode 100644 index 0000000..edbd076 --- /dev/null +++ b/include/hpl_pfact.hpp @@ -0,0 +1,199 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PFACT_HPP +#define HPL_PFACT_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" + +#include "hpl_pgesv.hpp" +#include "hpl_pmisc.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_panel.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef void (*HPL_T_PFA_FUN)(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +typedef void (*HPL_T_RFA_FUN)(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_dlocmax(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + int*, + double*); + +void HPL_dlocswpN(HPL_T_panel*, const int, const int, double*); +void HPL_dlocswpT(HPL_T_panel*, const int, const int, double*); +void HPL_pdmxswp(HPL_T_panel*, const int, const int, const int, double*); + +void HPL_pdpancrN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpancrT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanllN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanllT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanrlN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdpanrlT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpancrN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpancrT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanllN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanllT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanrlN(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdrpanrlT(HPL_T_panel*, + const int, + const int, + const int, + double*, + int, + int, + double*, + int*); + +void HPL_pdfact(HPL_T_panel*); + +#endif +/* + * End of hpl_pfact.hpp + */ diff --git a/include/hpl_pgesv.hpp b/include/hpl_pgesv.hpp new file mode 100644 index 0000000..1acf860 --- /dev/null +++ b/include/hpl_pgesv.hpp @@ -0,0 +1,150 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. 
+ * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PGESV_HPP +#define HPL_PGESV_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +#include "hpl_auxil.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_grid.hpp" +#include "hpl_comm.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_panel.hpp" +#include "hpl_pfact.hpp" + +/* + * --------------------------------------------------------------------- + * #typedefs and data structures + * --------------------------------------------------------------------- + */ +typedef enum { + HPL_LEFT_LOOKING = 301, /* Left looking lu fact variant */ + HPL_CROUT = 302, /* Crout lu fact variant */ + HPL_RIGHT_LOOKING = 303 /* Right looking lu fact variant */ +} HPL_T_FACT; + +typedef enum { + HPL_SWAP00 = 451, /* Use HPL_pdlaswp00 */ + HPL_SWAP01 = 452, /* Use HPL_pdlaswp01 */ + HPL_SW_MIX = 453, /* Use HPL_pdlaswp00_ for small number of */ + /* columns, and HPL_pdlaswp01_ otherwise. */ + HPL_NO_SWP = 499 +} HPL_T_SWAP; + +typedef enum { + HPL_LOOK_AHEAD = 0, /* look-ahead update */ + HPL_UPD_1 = 1, /* first update */ + HPL_UPD_2 = 2, /* second update */ + + HPL_N_UPD = 3 +} HPL_T_UPD; + +typedef void (*HPL_T_UPD_FUN)(HPL_T_panel*, const HPL_T_UPD); + +typedef struct HPL_S_palg { + HPL_T_TOP btopo; /* row broadcast topology */ + int depth; /* look-ahead depth */ + int nbdiv; /* recursive division factor */ + int nbmin; /* recursion stopping criterium */ + HPL_T_FACT pfact; /* panel fact variant */ + HPL_T_FACT rfact; /* recursive fact variant */ + HPL_T_PFA_FUN pffun; /* panel fact function ptr */ + HPL_T_RFA_FUN rffun; /* recursive fact function ptr */ + HPL_T_UPD_FUN upfun; /* update function */ + HPL_T_SWAP fswap; /* Swapping algorithm */ + int fsthr; /* Swapping threshold */ + int equil; /* Equilibration */ + int align; /* data alignment constant */ + double frac; /* update split percentage */ +} HPL_T_palg; + +typedef struct HPL_S_pmat { + double* dA; /* pointer to local piece of A */ + double* dX; /* pointer to solution vector */ + int n; /* global problem size */ + int nb; /* blocking factor */ + int ld; /* local leading dimension */ + int mp; /* local number of rows */ + int nq; /* local number of columns */ + int info; /* computational flag */ + double* A; + double* W; + double* dW; +} HPL_T_pmat; + +extern hipEvent_t swapStartEvent[HPL_N_UPD], update[HPL_N_UPD]; +extern hipEvent_t swapUCopyEvent[HPL_N_UPD], swapWCopyEvent[HPL_N_UPD]; +extern hipEvent_t dgemmStart[HPL_N_UPD], dgemmStop[HPL_N_UPD]; + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define MSGID_BEGIN_PFACT 1001 /* message id ranges */ +#define MSGID_END_PFACT 2000 +#define MSGID_BEGIN_FACT 2001 +#define MSGID_END_FACT 3000 +#define MSGID_BEGIN_PTRSV 3001 +#define MSGID_END_PTRSV 4000 + +#define MSGID_BEGIN_COLL 9001 +#define MSGID_END_COLL 10000 +/* + * --------------------------------------------------------------------- + * #define macros definitions + * --------------------------------------------------------------------- + */ +#define MNxtMgid(id_, beg_, end_) (((id_) + 1 > (end_) ? 
(beg_) : (id_) + 1)) +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ + +void HPL_pipid(HPL_T_panel*, int*, int*); +void HPL_piplen(HPL_T_panel*, const int, const int*, int*, int*); +void HPL_perm(const int, int*, int*, int*); + +void HPL_plindx(HPL_T_panel*, + const int, + const int*, + int*, + int*, + int*, + int*, + int*, + int*, + int*); + +void HPL_pdlaswp_start(HPL_T_panel* PANEL, const HPL_T_UPD UPD); +void HPL_pdlaswp_exchange(HPL_T_panel* PANEL, const HPL_T_UPD UPD); +void HPL_pdlaswp_end(HPL_T_panel* PANEL, const HPL_T_UPD UPD); +void HPL_pdupdateNT(HPL_T_panel*, const HPL_T_UPD); +void HPL_pdupdateTT(HPL_T_panel*, const HPL_T_UPD); +void HPL_pdgesv(HPL_T_grid*, HPL_T_palg*, HPL_T_pmat*); +void HPL_pdtrsv(HPL_T_grid*, HPL_T_pmat*); + +#endif +/* + * End of hpl_pgesv.hpp + */ diff --git a/include/hpl_pmatgen.hpp b/include/hpl_pmatgen.hpp new file mode 100644 index 0000000..603217f --- /dev/null +++ b/include/hpl_pmatgen.hpp @@ -0,0 +1,73 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PMATGEN_HPP +#define HPL_PMATGEN_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_pgesv.hpp" +#include "hpl_ptest.hpp" + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_MULT 6364136223846793005UL +#define HPL_IADD 1UL +#define HPL_DIVFAC 2147483648.0 +#define HPL_POW16 65536.0 +#define HPL_HALF 0.5 +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_xjumpm(const int JUMPM, + const uint64_t MULT, + const uint64_t IADD, + const uint64_t IRANN, + uint64_t& IRANM, + uint64_t& IAM, + uint64_t& ICM); + +void HPL_pdrandmat(const HPL_T_grid*, + const int, + const int, + const int, + double*, + const int, + const int); + +int HPL_pdmatgen(HPL_T_test*, + HPL_T_grid*, + HPL_T_palg*, + HPL_T_pmat*, + const int, + const int); + +void HPL_pdmatfree(HPL_T_pmat*); + +#endif +/* + * End of hpl_pmatgen.hpp + */ diff --git a/include/hpl_pmisc.hpp b/include/hpl_pmisc.hpp new file mode 100644 index 0000000..883d9c0 --- /dev/null +++ b/include/hpl_pmisc.hpp @@ -0,0 +1,29 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. 
+ * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PMISC_HPP +#define HPL_PMISC_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "mpi.h" + +#endif +/* + * End of hpl_pmisc.hpp + */ diff --git a/include/hpl_ptest.hpp b/include/hpl_ptest.hpp new file mode 100644 index 0000000..f3e93f9 --- /dev/null +++ b/include/hpl_ptest.hpp @@ -0,0 +1,118 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PTEST_HPP +#define HPL_PTEST_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_misc.hpp" +#include "hpl_blas.hpp" +#include "hpl_auxil.hpp" + +#include "hpl_pmisc.hpp" +#include "hpl_pauxil.hpp" +#include "hpl_panel.hpp" +#include "hpl_pgesv.hpp" + +#include "hpl_ptimer.hpp" +#include "hpl_pmatgen.hpp" + +/* + * --------------------------------------------------------------------- + * Data Structures + * --------------------------------------------------------------------- + */ +typedef struct HPL_S_test { + double epsil; /* epsilon machine */ + double thrsh; /* threshold */ + FILE* outfp; /* output stream (only in proc 0) */ + int kfail; /* # of tests failed */ + int kpass; /* # of tests passed */ + int kskip; /* # of tests skipped */ + int ktest; /* total number of tests */ +} HPL_T_test; + +/* + * --------------------------------------------------------------------- + * #define macro constants for testing only + * --------------------------------------------------------------------- + */ +#define HPL_LINE_MAX 256 +#define HPL_MAX_PARAM 20 +#define HPL_ISEED 100 +/* + * --------------------------------------------------------------------- + * global timers for timing analysis only + * --------------------------------------------------------------------- + */ +#define HPL_TIMING_BEG 11 /* timer 0 reserved, used by main */ +#define HPL_TIMING_N 8 /* number of timers defined below */ +#define HPL_TIMING_RPFACT 11 /* starting from here, contiguous */ +#define HPL_TIMING_PFACT 12 +#define HPL_TIMING_MXSWP 13 +#define HPL_TIMING_COPY 14 +#define HPL_TIMING_LBCAST 15 +#define HPL_TIMING_LASWP 16 +#define HPL_TIMING_UPDATE 17 +#define HPL_TIMING_PTRSV 18 +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +void HPL_pdinfo(int ARGC, + char** ARGV, + HPL_T_test*, + int*, + int*, + int*, + int*, + HPL_T_ORDER*, + int*, + int*, + int*, + int*, + int*, + int*, + HPL_T_FACT*, + int*, + int*, + int*, + int*, + int*, + HPL_T_FACT*, + int*, + HPL_T_TOP*, + int*, + int*, + HPL_T_SWAP*, + int*, + int*, + int*, + int*, + int*, + double*); + 
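+/*
+ * A minimal usage sketch of the timing indices above (illustrative, not
+ * part of the API): HPL_ptimer(), declared in hpl_ptimer.hpp, toggles
+ * the HPL_NPTIMER slot named by its argument, starting the timer on the
+ * first call with a given index and stopping/accumulating it on the
+ * next. For example, assuming a panel pointer `panel` is in scope:
+ *
+ *   HPL_ptimer(HPL_TIMING_PFACT);   start slot 12
+ *   HPL_pdfact(panel);
+ *   HPL_ptimer(HPL_TIMING_PFACT);   stop and accumulate slot 12
+ *
+ * The accumulated wall time can then be read back with
+ * HPL_ptimer_inquire(HPL_WALL_PTIME, HPL_TIMING_PFACT).
+ */
+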
+void HPL_pdtest(HPL_T_test*, HPL_T_grid*, HPL_T_palg*, const int, const int); +void HPL_InitGPU(const HPL_T_grid* GRID); +void HPL_FreeGPU(); + +#endif +/* + * End of hpl_ptest.hpp + */ diff --git a/include/hpl_ptimer.hpp b/include/hpl_ptimer.hpp new file mode 100644 index 0000000..b6ebf98 --- /dev/null +++ b/include/hpl_ptimer.hpp @@ -0,0 +1,71 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#ifndef HPL_PTIMER_HPP +#define HPL_PTIMER_HPP +/* + * --------------------------------------------------------------------- + * Include files + * --------------------------------------------------------------------- + */ +#include "hpl_pmisc.hpp" + +/* + * --------------------------------------------------------------------- + * #define macro constants + * --------------------------------------------------------------------- + */ +#define HPL_NPTIMER 64 +#define HPL_PTIMER_STARTFLAG 5.0 +#define HPL_PTIMER_ERROR -1.0 +/* + * --------------------------------------------------------------------- + * type definitions + * --------------------------------------------------------------------- + */ +typedef enum { HPL_WALL_PTIME = 101, HPL_CPU_PTIME = 102 } HPL_T_PTIME; + +typedef enum { + HPL_AMAX_PTIME = 201, + HPL_AMIN_PTIME = 202, + HPL_SUM_PTIME = 203 +} HPL_T_PTIME_OP; +/* + * --------------------------------------------------------------------- + * Function prototypes + * --------------------------------------------------------------------- + */ +double HPL_ptimer_cputime(void); +double HPL_ptimer_walltime(void); +void HPL_ptimer(const int); +void HPL_ptimer_boot(void); + +void HPL_ptimer_combine(MPI_Comm comm, + const HPL_T_PTIME_OP, + const HPL_T_PTIME, + const int, + const int, + double*); + +void HPL_ptimer_disable(void); +void HPL_ptimer_enable(void); +double HPL_ptimer_inquire(const HPL_T_PTIME, const int); +void HPL_ptimer_stepReset(const int, const int); +double HPL_ptimer_getStep(const int); + +#endif +/* + * End of hpl_ptimer.hpp + */ diff --git a/include/hpl_version.hpp.in b/include/hpl_version.hpp.in new file mode 100644 index 0000000..08cbb5f --- /dev/null +++ b/include/hpl_version.hpp.in @@ -0,0 +1,24 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#ifndef HPL_VERSION_HPP +#define HPL_VERSION_HPP + +// clang-format off +#define __ROCHPL_VER_MAJOR @rochpl_VERSION_MAJOR@ +#define __ROCHPL_VER_MINOR @rochpl_VERSION_MINOR@ +#define __ROCHPL_VER_PATCH @rochpl_VERSION_PATCH@ +#define __ROCHPL_VER_TWEAK @rochpl_VERSION_TWEAK@ +// clang-format on + +#define __ROCHPL_VER \ + 10000 * __ROCHPL_VER_MAJOR + 100 * __ROCHPL_VER_MINOR + __ROCHPL_VER_PATCH + +#endif // VERSION_HPP diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..4409f22 --- /dev/null +++ b/install.sh @@ -0,0 +1,390 @@ +#!/usr/bin/env bash +# Author: Nico Trost +# Modified by: Noel Chalmers + +#set -x #echo on + +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "rocHPL build helper script" + echo "./install " + echo " [-h|--help] prints this help message" + echo " [-g|--debug] Set build type to Debug (otherwise build Release)" + echo " [--prefix] Path to rocHPL install location (Default: build/rocHPL)" + echo " [--with-rocm=
<dir>
] Path to ROCm install (Default: /opt/rocm)" + echo " [--with-rocblas=] Path to rocBLAS library (Default: /opt/rocm/rocblas)" + echo " [--with-cpublas=] Path to external CPU BLAS library (Default: clone+build BLIS)" + echo " [--with-mpi=] Path to external MPI install (Default: clone+build OpenMPI)" + echo " [--verbose-print] Verbose output during HPL setup (Default: true)" + echo " [--progress-report] Print progress report to terminal during HPL run (Default: true)" + echo " [--detailed-timing] Record detailed timers during HPL run (Default: true)" +} + +# prereq: ${ID} must be defined before calling +supported_distro( ) +{ + if [ -z ${ID+foo} ]; then + printf "supported_distro(): \$ID must be set\n" + exit 2 + fi + + case "${ID}" in + ubuntu|centos|rhel|fedora|sles) + true + ;; + *) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n" + exit 2 + ;; + esac +} + +exit_with_error( ) +{ + if (( $1 == 2 )); then + # Failure in some install step + # Print some message about needed dependencies + + # dependencies needed for executable to build + local library_dependencies_ubuntu=( "git" "make" "cmake" "libnuma-dev" "pkg-config" "autoconf" "libtool" "automake" "m4" "flex" "libgomp1") + local library_dependencies_centos=( "git" "make" "cmake3" "gcc-c++" "rpm-build" "epel-release" "numactl-libs" "autoconf" "libtool" "automake" "m4" "flex" "libgomp") + local library_dependencies_fedora=( "git" "make" "cmake" "gcc-c++" "libcxx-devel" "rpm-build" "numactl-libs" "autoconf" "libtool" "automake" "m4" "flex" "libgomp") + local library_dependencies_sles=( "git" "make" "cmake" "gcc-c++" "libcxxtools9" "rpm-build" "libnuma-devel" "autoconf" "libtool" "automake" "m4" "flex" "libgomp1") + + if [[ "${with_rocm}" == /opt/rocm ]]; then + library_dependencies_ubuntu+=("rocblas" "rocblas-dev") + library_dependencies_centos+=("rocblas" "rocblas-devel") + library_dependencies_fedora+=("rocblas" "rocblas-dev") + library_dependencies_sles+=("rocblas" "rocblas-devel") + fi + + printf "Installation failed. Some required packages may be missing.\n" + printf "The following package manager install command may be needed:\n" + case "${ID}" in + ubuntu) + printf "sudo apt install -y ${library_dependencies_ubuntu[*]}\n" + ;; + + centos|rhel) + printf "sudo yum -y --nogpgcheck install ${library_dependencies_centos[*]}\n" + ;; + + fedora) + printf "sudo dnf install -y ${library_dependencies_fedora[*]}\n" + ;; + + sles) + printf "sudo zypper -n --no-gpg-checks install ${library_dependencies_sles[*]}\n" + ;; + *) + exit 2 + ;; + esac + fi + + exit $1 +} + +check_exit_code( ) +{ + if (( $? != 0 )); then + exit $@ + fi +} + + +# Install BLIS in rochpl/tpl +install_blis( ) +{ + if [ ! -d "./tpl/blis" ]; then + mkdir -p tpl && cd tpl + git clone https://github.com/amd/blis --branch 3.1 + check_exit_code 2 + cd blis; ./configure --prefix=${PWD} --enable-cblas --disable-sup-handling auto; + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install -j$(nproc) + check_exit_code 2 + cd ../.. + elif [ ! -f "./tpl/blis/lib/libblis.so" ]; then + cd tpl/blis; ./configure --prefix=${PWD} --enable-cblas --disable-sup-handling auto; + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install -j$(nproc) + check_exit_code 2 + cd ../.. + fi + + # Check for successful build + if [ ! -f "./tpl/blis/lib/libblis.so" ]; then + echo "Error: BLIS install unsuccessful." 
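+    # exit code 2 makes exit_with_error print the distro-specific
+    # dependency hints assembled above before exiting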
+ exit_with_error 2 + fi +} + +# Clone and build OpenMPI+UCX in rochpl/tpl +install_openmpi( ) +{ + #OpenMPI and UCX install to one of these locations depending on OS + ucx_lib_folder=./tpl/ucx/lib + ompi_lib_folder=./tpl/openmpi/lib + ucx_lib64_folder=./tpl/ucx/lib64 + ompi_lib64_folder=./tpl/openmpi/lib64 + + if [ ! -d "./tpl/ucx" ]; then + mkdir -p tpl && cd tpl + git clone --branch master https://github.com/openucx/ucx.git ucx + check_exit_code 2 + cd ucx; + git checkout b38c71e94ccbbafbaa308f04ad2539425f345483 + ./autogen.sh; ./autogen.sh #why do we have to run this twice? + check_exit_code 2 + mkdir build; cd build + ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + elif ([ ! -f "${ucx_lib_folder}/libucm.so" ] || [ ! -f "${ucx_lib_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib_folder}/libucs.so" ] || [ ! -f "${ucx_lib_folder}/libuct.so" ]) && \ + ([ ! -f "${ucx_lib64_folder}/libucm.so" ] || [ ! -f "${ucx_lib64_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib64_folder}/libucs.so" ] || [ ! -f "${ucx_lib64_folder}/libuct.so" ]); then + cd tpl/ucx; + git checkout b38c71e94ccbbafbaa308f04ad2539425f345483 + ./autogen.sh; ./autogen.sh + check_exit_code 2 + mkdir build; cd build + ../contrib/configure-opt --prefix=${PWD}/../ --with-rocm=${with_rocm} --without-knem --without-cuda --without-java + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + fi + + # Check for successful build + if ([ ! -f "${ucx_lib_folder}/libucm.so" ] || [ ! -f "${ucx_lib_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib_folder}/libucs.so" ] || [ ! -f "${ucx_lib_folder}/libuct.so" ]) && + ([ ! -f "${ucx_lib64_folder}/libucm.so" ] || [ ! -f "${ucx_lib64_folder}/libucp.so" ] || \ + [ ! -f "${ucx_lib64_folder}/libucs.so" ] || [ ! -f "${ucx_lib64_folder}/libuct.so" ]); then + echo "Error: UCX install unsuccessful." + exit 3 + fi + + if [ ! -d "./tpl/openmpi" ]; then + mkdir -p tpl && cd tpl + git clone --branch v4.1.4 https://github.com/open-mpi/ompi.git openmpi + check_exit_code 2 + cd openmpi; ./autogen.pl; + check_exit_code 2 + mkdir build; cd build + ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + elif [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then + cd tpl/openmpi; ./autogen.pl; + check_exit_code 2 + mkdir build; cd build + ../configure --prefix=${PWD}/../ --with-ucx=${PWD}/../../ucx --without-verbs + check_exit_code 2 + make -j$(nproc) + check_exit_code 2 + make install + check_exit_code 2 + cd ../../.. + fi + + # Check for successful build + if [ ! -f "${ompi_lib_folder}/libmpi.so" ] && [ ! -f "${ompi_lib64_folder}/libmpi.so" ]; then + echo "Error: OpenMPI install unsuccessful." + exit_with_error 2 + fi +} + +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? 
-ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit_with_error 1 +fi + +# os-release file describes the system +if [[ -e "/etc/os-release" ]]; then + source /etc/os-release +else + echo "This script depends on the /etc/os-release file" + exit_with_error 1 +fi + +# The following function exits script if an unsupported distro is detected +supported_distro + +# ################################################# +# global variables +# ################################################# +install_prefix=rocHPL +build_release=true +with_rocm=/opt/rocm +with_mpi=tpl/openmpi +with_rocblas=/opt/rocm/rocblas +with_cpublas=tpl/blis/lib +openmpi_ucx=false +verbose_print=true +progress_report=true +detailed_timing=true + +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,debug,prefix:,with-rocm:,with-mpi:,with-rocblas:,with-cpublas:,verbose-print:,progress-report:,detailed-timing: --options hg -- "$@") +else + echo "Need a new version of getopt" + exit_with_error 1 +fi + +if [[ $? -ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit_with_error 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + -g|--debug) + build_release=false + shift ;; + --prefix) + install_prefix=${2} + shift 2 ;; + --with-rocm) + with_rocm=${2} + shift 2 ;; + --with-mpi) + with_mpi=${2} + shift 2 ;; + --with-rocblas) + with_rocblas=${2} + shift 2 ;; + --with-cpublas) + with_cpublas=${2} + shift 2 ;; + --verbose-print) + verbose_print=${2} + shift 2 ;; + --progress-report) + progress_report=${2} + shift 2 ;; + --detailed-timing) + detailed_timing=${2} + shift 2 ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit_with_error 1 + ;; + esac +done + +build_dir=./build +printf "\033[32mCreating project build directory in: \033[33m${build_dir}\033[0m\n" + +# ################################################# +# prep +# ################################################# +# ensure a clean build environment +rm -rf ${build_dir} + +# Default cmake executable is called cmake +cmake_executable=cmake + +# We append customary rocm path; if user provides custom rocm path in ${path}, our +# hard-coded path has lesser priority +export ROCM_PATH=${with_rocm} +export PATH=${PATH}:${ROCM_PATH}/bin + +pushd . 
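+  # Example invocations (illustrative; the paths below are placeholders):
+  #   ./install.sh                              # release build; BLIS and OpenMPI cloned+built in ./tpl
+  #   ./install.sh --with-rocm=/opt/rocm-5.2.0  # build against an existing ROCm install
+  #   ./install.sh -g --prefix=$HOME/rocHPL     # debug build with a custom install prefix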
+ # ################################################# + # BLAS + # ################################################# + if [[ "${with_cpublas}" == tpl/blis/lib ]]; then + + install_blis + + fi + + # ################################################# + # MPI + # ################################################# + if [[ "${with_mpi}" == tpl/openmpi ]]; then + + #gpu_aware_mpi=ON #turn on GPU-aware MPI when using internal MPI library + with_mpi=${PWD}/tpl/openmpi + openmpi_ucx=true + install_openmpi + + fi + + # ################################################# + # configure & build + # ################################################# + cmake_common_options="-DCMAKE_INSTALL_PREFIX=${install_prefix} -DHPL_BLAS_DIR=${with_cpublas} + -DHPL_MPI_DIR=${with_mpi} -DROCM_PATH=${with_rocm} -DROCBLAS_PATH=${with_rocblas}" + + # build type + if [[ "${build_release}" == true ]]; then + cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Release" + else + cmake_common_options="${cmake_common_options} -DCMAKE_BUILD_TYPE=Debug" + fi + + shopt -s nocasematch + if [[ "${verbose_print}" == on || "${verbose_print}" == true || "${verbose_print}" == 1 || "${verbose_print}" == enabled ]]; then + cmake_common_options="${cmake_common_options} -DHPL_VERBOSE_PRINT=ON" + fi + if [[ "${progress_report}" == on || "${progress_report}" == true || "${progress_report}" == 1 || "${progress_report}" == enabled ]]; then + cmake_common_options="${cmake_common_options} -DHPL_PROGRESS_REPORT=ON" + fi + if [[ "${detailed_timing}" == on || "${detailed_timing}" == true || "${detailed_timing}" == 1 || "${detailed_timing}" == enabled ]]; then + cmake_common_options="${cmake_common_options} -DHPL_DETAILED_TIMING=ON" + fi + shopt -u nocasematch + + if [[ "${openmpi_ucx}" == true ]]; then + cmake_common_options="${cmake_common_options} -DHPL_OPENMPI_UCX=ON" + fi + + # Build library with AMD toolchain because of existence of device kernels + mkdir -p ${build_dir} && cd ${build_dir} + ${cmake_executable} ${cmake_common_options} .. + check_exit_code 2 + + make -j$(nproc) install + check_exit_code 2 + +popd diff --git a/scripts/HPL.dat b/scripts/HPL.dat new file mode 100644 index 0000000..3b7fec0 --- /dev/null +++ b/scripts/HPL.dat @@ -0,0 +1,31 @@ +HPLinpack benchmark input file +Innovative Computing Laboratory, University of Tennessee +HPL.out output file name (if any) +0 device out (6=stdout,7=stderr,file) +1 # of problems sizes (N) +45312 Ns +1 # of NBs +384 NBs +1 PMAP process mapping (0=Row-,1=Column-major) +1 # of process grids (P x Q) +1 Ps +1 Qs +16.0 threshold +1 # of panel fact +2 PFACTs (0=left, 1=Crout, 2=Right) +1 # of recursive stopping criterium +2 NBMINs (>= 1) +1 # of panels in recursion +2 NDIVs +1 # of recursive panel fact. 
+2 RFACTs (0=left, 1=Crout, 2=Right) +1 # of broadcast +6 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) +1 # of lookahead depth +1 DEPTHs (>=0) +1 SWAP (0=bin-exch,1=long,2=mix) +64 swapping threshold +1 L1 in (0=transposed,1=no-transposed) form +0 U in (0=transposed,1=no-transposed) form +0 Equilibration (0=no,1=yes) +8 memory alignment in double (> 0) diff --git a/scripts/mpirun_rochpl.in b/scripts/mpirun_rochpl.in new file mode 100755 index 0000000..8138238 --- /dev/null +++ b/scripts/mpirun_rochpl.in @@ -0,0 +1,198 @@ +#!/usr/bin/env bash +# Author: Noel Chalmers + +# set -x #echo on + +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "rocHPL MPI run helper script" + echo "./mpirun_rochpl " + echo " [-P] Specific MPI grid size: the number of " + echo " rows in MPI grid. " + echo " [-Q] Specific MPI grid size: the number of " + echo " columns in MPI grid. " + echo " [-p] Specific node-local MPI grid size: the number " + echo " of rows in node-local MPI grid. Must evenly " + echo " divide P. " + echo " [-q] Specific node-local MPI grid size: the number " + echo " of columns in node-local MPI grid. Must evenly" + echo " divide Q. " + echo " [-N] Specific matrix size: the number of " + echo " rows/columns in global matrix. " + echo " [--NB] Specific panel size: the number of " + echo " rows/columns in panels. " + echo " [-f] Specific split fraction: the percentange to " + echo " split the trailing submatrix. " + echo " [-i] Input file. When set, all other commnand " + echo " line parameters are ignored, and problem " + echo " parameters are read from input file. " + echo " [-h|--help] prints this help message " + echo " [--version] Print rocHPL version number. " +} + +# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root +# true is a system command that completes successfully, function returns success +# prereq: ${ID} must be defined before calling +supported_distro( ) +{ + if [ -z ${ID+foo} ]; then + printf "supported_distro(): \$ID must be set\n" + exit 2 + fi + + case "${ID}" in + ubuntu|centos|rhel|fedora|sles) + true + ;; + *) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n" + exit 2 + ;; + esac +} + +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? 
-ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit 1 +fi + +# os-release file describes the system +if [[ -e "/etc/os-release" ]]; then + source /etc/os-release +else + echo "This script depends on the /etc/os-release file" + exit 2 +fi + +# The following function exits script if an unsupported distro is detected +supported_distro + +# ################################################# +# global variables +# ################################################# +# Grab options from CMake config +rochpl_bin=@CMAKE_INSTALL_PREFIX@/bin/rochpl +mpi_bin=@MPIEXEC_EXECUTABLE@ +rochpl_runscript=$(dirname "$0")/run_rochpl #assume run_rochpl is in the same location +openmpi_ucx=@HPL_OPENMPI_UCX@ + +P=1 +Q=1 +p=-1 +q=-1 +N=45312 +NB=384 +frac=0.6 + +filename=HPL.dat +inputfile=false +cmdrun=false + +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,help,version, --options hP:Q:p:q:N:i:f: -- "$@") +else + echo "Need a new version of getopt" + exit 1 +fi + +if [[ $? -ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + --version) + ${mpi_bin} -np 1 ${rochpl_bin} --version + exit 0 + ;; + -P) + P=${2} + shift 2 ;; + -Q) + Q=${2} + shift 2 ;; + -p) + p=${2} + shift 2 ;; + -q) + q=${2} + shift 2 ;; + -N) + N=${2} + cmdrun=true + shift 2 ;; + --NB) + NB=${2} + cmdrun=true + shift 2 ;; + -f) + frac=${2} + shift 2 ;; + -i) + filename=${2} + inputfile=true + shift 2 ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit 1 + ;; + esac +done + +#if nothing but np and ppn parameters where given, default to running +# with default input file +if [[ "${inputfile}" == false && "${cmdrun}" == false ]]; then + inputfile=true +fi + +np=$(($P*$Q)) +if [[ "$np" -lt 1 ]]; then + echo "Invalid MPI grid parameters; aborting"; + exit 1 +fi + +# count the number of physical cores +num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') +num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') +total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) + +#Default MPI options +mpi_args="--map-by node:PE=${total_cpu_cores} --bind-to core:overload-allowed" + +if [[ ${openmpi_ucx} == ON ]]; then + # run with openmpi + ucx + mpi_args="--mca pml ucx --mca btl ^vader,tcp,openib,uct ${mpi_args}" +fi + +if [[ "${inputfile}" == true ]]; then + rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -i ${filename} -f ${frac}" +else + rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -N ${N} --NB ${NB} -f ${frac}" +fi + +#run +${mpi_bin} -np ${np} ${mpi_args} ${rochpl_runscript} ${rochpl_args} diff --git a/scripts/run_rochpl.in b/scripts/run_rochpl.in new file mode 100755 index 0000000..5684932 --- /dev/null +++ b/scripts/run_rochpl.in @@ -0,0 +1,410 @@ +#!/usr/bin/env bash +# Author: Noel Chalmers + +# set -x #echo on + +# ################################################# +# helper functions +# ################################################# +function display_help() +{ + echo "rocHPL run helper script" + echo "./run_rochpl " + echo " [-P] Specific MPI grid size: the number of " + echo " rows in MPI grid. 
" + echo " [-Q] Specific MPI grid size: the number of " + echo " columns in MPI grid. " + echo " [-p] Specific node-local MPI grid size: the number " + echo " of rows in node-local MPI grid. Must evenly " + echo " divide P. " + echo " [-q] Specific node-local MPI grid size: the number " + echo " of columns in node-local MPI grid. Must evenly" + echo " divide Q. " + echo " [-N] Specific matrix size: the number of " + echo " rows/columns in global matrix. " + echo " [--NB] Specific panel size: the number of " + echo " rows/columns in panels. " + echo " [-f] Specific split fraction: the percentange to " + echo " split the trailing submatrix. " + echo " [-i] Input file. When set, all other commnand " + echo " line parameters are ignored, and problem " + echo " parameters are read from input file. " + echo " [-h|--help] prints this help message " + echo " [--version] Print rocHPL version number. " +} + +# This function is helpful for dockerfiles that do not have sudo installed, but the default user is root +# true is a system command that completes successfully, function returns success +# prereq: ${ID} must be defined before calling +supported_distro( ) +{ + if [ -z ${ID+foo} ]; then + printf "supported_distro(): \$ID must be set\n" + exit 2 + fi + + case "${ID}" in + ubuntu|centos|rhel|fedora|sles) + true + ;; + *) printf "This script is currently supported on Ubuntu, CentOS, RHEL, Fedora and SLES\n" + exit 2 + ;; + esac +} + +# ################################################# +# Pre-requisites check +# ################################################# +# Exit code 0: alls well +# Exit code 1: problems with getopt +# Exit code 2: problems with supported platforms + +# check if getopt command is installed +type getopt > /dev/null +if [[ $? -ne 0 ]]; then + echo "This script uses getopt to parse arguments; try installing the util-linux package"; + exit 1 +fi + +# os-release file describes the system +if [[ -e "/etc/os-release" ]]; then + source /etc/os-release +else + echo "This script depends on the /etc/os-release file" + exit 2 +fi + +# The following function exits script if an unsupported distro is detected +supported_distro + +# ################################################# +# global variables +# ################################################# +# Grab options from CMake config +rochpl_bin=@CMAKE_INSTALL_PREFIX@/bin/rochpl +rocm_dir=@ROCM_PATH@ +rocblas_dir=@ROCBLAS_LIB_PATH@ +blas_dir=@HPL_BLAS_DIR@ + +P=1 +Q=1 +p=-1 +q=-1 +N=45312 +NB=384 +frac=0.6 + +filename=HPL.dat +inputfile=false +cmdrun=false + +export LD_LIBRARY_PATH=${rocblas_dir}:${blas_dir}:${rocm_dir}/lib:$LD_LIBRARY_PATH + +oversubscribe=true + +# ################################################# +# Parameter parsing +# ################################################# + +# check if we have a modern version of getopt that can handle whitespace and long parameters +getopt -T +if [[ $? -eq 4 ]]; then + GETOPT_PARSE=$(getopt --name "${0}" --longoptions NB:,help,version, --options hP:Q:p:q:N:i:f: -- "$@") +else + echo "Need a new version of getopt" + exit 1 +fi + +if [[ $? 
-ne 0 ]]; then + echo "getopt invocation failed; could not parse the command line"; + exit 1 +fi + +eval set -- "${GETOPT_PARSE}" + +while true; do + case "${1}" in + -h|--help) + display_help + exit 0 + ;; + --version) + ${rochpl_bin} --version + exit 0 + ;; + -P) + P=${2} + shift 2 ;; + -Q) + Q=${2} + shift 2 ;; + -p) + p=${2} + shift 2 ;; + -q) + q=${2} + shift 2 ;; + -N) + N=${2} + cmdrun=true + shift 2 ;; + --NB) + NB=${2} + cmdrun=true + shift 2 ;; + -f) + frac=${2} + shift 2 ;; + -i) + filename=${2} + inputfile=true + shift 2 ;; + --) shift ; break ;; + *) echo "Unexpected command line parameter received; aborting"; + exit 1 + ;; + esac +done + +#if nothing but np and ppn parameters where given, default to running +# with default input file +if [[ "${inputfile}" == false && "${cmdrun}" == false ]]; then + inputfile=true +fi + +np=$(($P*$Q)) +if [[ "$np" -lt 1 ]]; then + echo "Invalid MPI grid parameters; aborting"; + exit 1 +fi + +####################################### +# Now figure out the CPU core mappings +####################################### + +# Get local process numbering +set +u +if [[ -n ${OMPI_COMM_WORLD_LOCAL_RANK+x} ]]; then + globalRank=$OMPI_COMM_WORLD_RANK + globalSize=$OMPI_COMM_WORLD_SIZE + rank=$OMPI_COMM_WORLD_LOCAL_RANK + size=$OMPI_COMM_WORLD_LOCAL_SIZE +elif [[ -n ${SLURM_LOCALID+x} ]]; then + globalRank=$SLURM_PROCID + globalSize=$SLURM_NTASKS + rank=$SLURM_LOCALID + size=$SLURM_TASKS_PER_NODE + #Slurm can return a string like "2(x2),1". Get the first number + size=$(echo $size | sed -r 's/^([^.]+).*$/\1/; s/^[^0-9]*([0-9]+).*$/\1/') +fi +set -u + +#Determing node-local grid size +if [[ "$p" -lt 1 && "$q" -lt 1 ]]; then + # no node-local grid was specified, pick defaults + q=$(( (Q<=size) ? Q : size)) + + if [[ $((size % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + p=$(( size/q )) + +elif [[ "$p" -lt 1 ]]; then + #q was specified + + if [[ $((size % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + p=$(( size/q )) + +elif [[ "$q" -lt 1 ]]; then + #p was specified + + if [[ $((size % p)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi + + q=$(( size/p )) + +else + #Both p and q were specified + if [[ $size -ne $((p*q)) ]]; then + echo "Invalid MPI grid parameters; Unable to form node-local grid; aborting"; + exit 1 + fi +fi + +# Check that the columns are evenly divided among nodes +if [[ $((P % p)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Must have the same number of P rows on every node; aborting"; + exit 1 +fi + +# Check that the rows are evenly divided among nodes +if [[ $((Q % q)) -gt 0 ]]; then + echo "Invalid MPI grid parameters; Must have the same number of Q columns on every node; aborting"; + exit 1 +fi + +# count the number of physical cores on node +num_cpu_cores=$(lscpu | grep "Core(s)" | awk '{print $4}') +num_cpu_sockets=$(lscpu | grep Socket | awk '{print $2}') +total_cpu_cores=$(($num_cpu_cores*$num_cpu_sockets)) + +# Ranks in different processes rows will take distinct chunks of cores +row_stride=$((total_cpu_cores/p)) +col_stride=$((row_stride/q)) + +myp=$((rank%p)) +myq=$((rank/p)) + +#Although ranks are column-major order, we select GPUs in row-major order on node +mygpu=$((myq+myp*q)) + +# Try to detect special Bard-peak core mapping +if [[ -n ${HPL_PLATFORM+x} ]]; then + platform=$HPL_PLATFORM +else + platform=$(cat 
/sys/class/dmi/id/product_name) +fi + +if [[ "$platform" == "BardPeak" || "$platform" == "HPE_CRAY_EX235A" ]]; then + # Special core mapping for BardPeak + + # Debug + # if [[ $globalRank == 0 ]]; then + # echo "BardPeak platform detected" + # fi + + # Sanity check + if [[ $size -gt 8 ]]; then + echo "Unsupported number of ranks on BardPeak platform; aborting"; + exit 1 + fi + + # GCD0 cores="48-55" + # GCD1 cores="56-63" + # GCD2 cores="16-23" + # GCD3 cores="24-31" + # GCD4 cores="0-7" + # GCD5 cores="8-15" + # GCD6 cores="32-39" + # GCD7 cores="40-47" + + root_cores=(48 56 16 24 0 8 32 40) + root_core=${root_cores[mygpu]} + + # First omp place is the root core + omp_places="{$root_core}" + + # First assign the CCD + for i in $(seq $((root_core+1)) $((root_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=8 + + places="{$root_core-$((root_core+7))}" + + # Loop through unassigned CCDs + for c in $(seq $((mygpu+size)) $size 7) + do + iroot_core=${root_cores[c]} + for i in $(seq $((iroot_core)) $((iroot_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+8)) + places+=",{$iroot_core-$((iroot_core+7))}" + done + + if [[ "${oversubscribe}" == true ]]; then + # Add cores from different columns, without their root cores + for j in $(seq 0 $((q-1))) + do + if [[ "$j" == "$myq" ]]; then + continue + fi + for jj in $(seq 0 $size 7) + do + q_gpu=$((jj+j+myp*q)) + q_core=$((root_cores[q_gpu])) + offset=$(( (q_gpu>=size) ? 0 : 1)) + for i in $(seq $((q_core+offset)) $((q_core+8-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+8-offset)) + places+=",{$((q_core+offset))-$((q_core+7))}" + done + done + fi + +else + # Default core mapping + root_core=$((myp*row_stride + myq*col_stride)) + + omp_num_threads=${col_stride} + # First omp place is the root core + omp_places="{$root_core}" + + # Make contiuguous chunk of cores (to maximize L1/L2 locality) + for i in $(seq $((root_core+1)) $((root_core+col_stride-1))) + do + omp_places+=",{$i}" + done + + if [[ $col_stride -gt 1 ]]; then + places="{$root_core-$((root_core+col_stride-1))}" + else + places="{$root_core}" + fi + + if [[ "${oversubscribe}" == true ]]; then + # Add cores from different columns, without their root cores + for j in $(seq 0 $((q-1))) + do + if [[ "$j" == "$myq" ]]; then + continue + fi + q_core=$((myp*row_stride + j*col_stride)) + for i in $(seq $((q_core+1)) $((q_core+col_stride-1))) + do + omp_places+=",{$i}" + done + omp_num_threads=$((omp_num_threads+col_stride-1)) + + if [[ $col_stride -gt 2 ]]; then + places+=",{$((q_core+1))-$((q_core+col_stride-1))}" + elif [[ $col_stride -gt 1 ]]; then + places+=",{$((q_core+1))}" + fi + + done + fi +fi + +# Export OpenMP config +export OMP_NUM_THREADS=${omp_num_threads} +export OMP_PLACES=${omp_places} +export OMP_PROC_BIND=true + +if [[ $globalRank -lt $size ]]; then + echo "Node Binding: Process $rank [(p,q)=($myp,$myq)] CPU Cores: $omp_num_threads - $places" +fi + +rochpl_args="-P ${P} -Q ${Q} -p ${p} -q ${q} -f ${frac}" +if [[ "${inputfile}" == true ]]; then + rochpl_args+=" -i ${filename}" +else + rochpl_args+=" -N ${N} -NB ${NB}" +fi + +#run +${rochpl_bin} ${rochpl_args} diff --git a/src/HPL_InitGPU.cpp b/src/HPL_InitGPU.cpp new file mode 100644 index 0000000..f5b62b7 --- /dev/null +++ b/src/HPL_InitGPU.cpp @@ -0,0 +1,119 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, 
Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include + +rocblas_handle handle; + +hipStream_t computeStream, dataStream; + +hipEvent_t swapStartEvent[HPL_N_UPD], update[HPL_N_UPD]; +hipEvent_t dgemmStart[HPL_N_UPD], dgemmStop[HPL_N_UPD]; + +static char host_name[MPI_MAX_PROCESSOR_NAME]; + +/* + This function finds out how many MPI processes are running on the same node + and assigns a local rank that can be used to map a process to a device. + This function needs to be called by all the MPI processes. +*/ +void HPL_InitGPU(const HPL_T_grid* GRID) { + char host_name[MPI_MAX_PROCESSOR_NAME]; + + int i, n, namelen, rank, nprocs; + int dev; + + int nprow, npcol, myrow, mycol; + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &nprocs); + + MPI_Get_processor_name(host_name, &namelen); + + int localRank = GRID->local_mycol + GRID->local_myrow * GRID->local_npcol; + int localSize = GRID->local_npcol * GRID->local_nprow; + + /* Find out how many GPUs are in the system and their device number */ + int deviceCount; + hipGetDeviceCount(&deviceCount); + + if(deviceCount < 1) { + if(localRank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_InitGPU", + "Node %s found no GPUs. Is the ROCm kernel module loaded?", + host_name); + MPI_Finalize(); + exit(1); + } + + dev = localRank % deviceCount; + +#ifdef HPL_VERBOSE_PRINT + if(rank < localSize) { + hipDeviceProp_t props; + hipGetDeviceProperties(&props, dev); + + printf("GPU Binding: Process %d [(p,q)=(%d,%d)] GPU: %d, pciBusID %x \n", + rank, + GRID->local_myrow, + GRID->local_mycol, + dev, + props.pciBusID); + } +#endif + + /* Assign device to MPI process, initialize BLAS and probe device properties + */ + hipSetDevice(dev); + + hipStreamCreate(&computeStream); + hipStreamCreate(&dataStream); + + hipEventCreate(swapStartEvent + HPL_LOOK_AHEAD); + hipEventCreate(swapStartEvent + HPL_UPD_1); + hipEventCreate(swapStartEvent + HPL_UPD_2); + + hipEventCreate(update + HPL_LOOK_AHEAD); + hipEventCreate(update + HPL_UPD_1); + hipEventCreate(update + HPL_UPD_2); + + hipEventCreate(dgemmStart + HPL_LOOK_AHEAD); + hipEventCreate(dgemmStart + HPL_UPD_1); + hipEventCreate(dgemmStart + HPL_UPD_2); + + hipEventCreate(dgemmStop + HPL_LOOK_AHEAD); + hipEventCreate(dgemmStop + HPL_UPD_1); + hipEventCreate(dgemmStop + HPL_UPD_2); +} + +void HPL_FreeGPU() { + hipEventDestroy(swapStartEvent[HPL_LOOK_AHEAD]); + hipEventDestroy(swapStartEvent[HPL_UPD_1]); + hipEventDestroy(swapStartEvent[HPL_UPD_2]); + + hipEventDestroy(update[HPL_LOOK_AHEAD]); + hipEventDestroy(update[HPL_UPD_1]); + hipEventDestroy(update[HPL_UPD_2]); + + hipEventDestroy(dgemmStart[HPL_LOOK_AHEAD]); + hipEventDestroy(dgemmStart[HPL_UPD_1]); + hipEventDestroy(dgemmStart[HPL_UPD_2]); + + hipEventDestroy(dgemmStop[HPL_LOOK_AHEAD]); + hipEventDestroy(dgemmStop[HPL_UPD_1]); + hipEventDestroy(dgemmStop[HPL_UPD_2]); + + hipStreamDestroy(dataStream); + hipStreamDestroy(computeStream); +} diff --git a/src/HPL_pddriver.cpp b/src/HPL_pddriver.cpp new file mode 100644 index 0000000..8b65d4d --- /dev/null +++ b/src/HPL_pddriver.cpp @@ -0,0 +1,285 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int main(int ARGC, char** ARGV) { + /* + * Purpose + * ======= + * + * main is the main driver program for testing the HPL routines. + * This program is driven by a short data file named "HPL.dat". + * + * --------------------------------------------------------------------- + */ + + int nval[HPL_MAX_PARAM], nbval[HPL_MAX_PARAM], pval[HPL_MAX_PARAM], + qval[HPL_MAX_PARAM], nbmval[HPL_MAX_PARAM], ndvval[HPL_MAX_PARAM], + ndhval[HPL_MAX_PARAM]; + + HPL_T_FACT pfaval[HPL_MAX_PARAM], rfaval[HPL_MAX_PARAM]; + + HPL_T_TOP topval[HPL_MAX_PARAM]; + + HPL_T_grid grid; + HPL_T_palg algo; + HPL_T_test test; + int L1notran, Unotran, align, equil, in, inb, inbm, indh, indv, ipfa, ipq, + irfa, itop, mycol, myrow, ns, nbs, nbms, ndhs, ndvs, npcol, npfs, npqs, + nprow, nrfs, ntps, rank, size, tswap; + HPL_T_ORDER pmapping; + HPL_T_FACT rpfa; + HPL_T_SWAP fswap; + double frac; + int p, q; + + MPI_Init(&ARGC, &ARGV); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + MPI_Op_create(HPL_dmxswp, true, &HPL_DMXSWP); + + /* + * Read and check validity of test parameters from input file + * + * HPL Version 1.0, Linpack benchmark input file + * Your message here + * HPL.out output file name (if any) + * 6 device out (6=stdout,7=stderr,file) + * 4 # of problems sizes (N) + * 29 30 34 35 Ns + * 4 # of NBs + * 1 2 3 4 NBs + * 0 PMAP process mapping (0=Row-,1=Column-major) + * 3 # of process grids (P x Q) + * 2 1 4 Ps + * 2 4 1 Qs + * 16.0 threshold + * 3 # of panel fact + * 0 1 2 PFACTs (0=left, 1=Crout, 2=Right) + * 2 # of recursive stopping criterium + * 2 4 NBMINs (>= 1) + * 1 # of panels in recursion + * 2 NDIVs + * 3 # of recursive panel fact. + * 0 1 2 RFACTs (0=left, 1=Crout, 2=Right) + * 1 # of broadcast + * 0 BCASTs (0=1rg,1=1rM,2=2rg,3=2rM,4=Lng,5=LnM) + * 1 # of lookahead depth + * 0 DEPTHs (>=0) + * 2 SWAP (0=bin-exch,1=long,2=mix) + * 4 swapping threshold + * 0 L1 in (0=transposed,1=no-transposed) form + * 0 U in (0=transposed,1=no-transposed) form + * 1 Equilibration (0=no,1=yes) + * 8 memory alignment in double (> 0) + */ + HPL_pdinfo(ARGC, + ARGV, + &test, + &ns, + nval, + &nbs, + nbval, + &pmapping, + &npqs, + pval, + qval, + &p, + &q, + &npfs, + pfaval, + &nbms, + nbmval, + &ndvs, + ndvval, + &nrfs, + rfaval, + &ntps, + topval, + &ndhs, + ndhval, + &fswap, + &tswap, + &L1notran, + &Unotran, + &equil, + &align, + &frac); + + /* + * Loop over different process grids - Define process grid. Go to bottom + * of process grid loop if this case does not use my process. 
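+ *
+ * For example (using the sample input above: 4 Ns, 4 NBs, 1 DEPTH,
+ * 1 BCAST, 3 RFACTs, 3 PFACTs, 2 NBMINs, 1 NDIV), the nested loops
+ * below run 4 x 4 x 1 x 1 x 3 x 3 x 2 x 1 = 288 factorizations on
+ * each process grid that contains this process.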
+ */ + for(ipq = 0; ipq < npqs; ipq++) { + (void)HPL_grid_init( + MPI_COMM_WORLD, pmapping, pval[ipq], qval[ipq], p, q, &grid); + (void)HPL_grid_info(&grid, &nprow, &npcol, &myrow, &mycol); + + if((myrow < 0) || (myrow >= nprow) || (mycol < 0) || (mycol >= npcol)) + goto label_end_of_npqs; + + // Initialize GPU + HPL_InitGPU(&grid); + + for(in = 0; in < ns; in++) { /* Loop over various problem sizes */ + for(inb = 0; inb < nbs; inb++) { /* Loop over various blocking factors */ + for(indh = 0; indh < ndhs; + indh++) { /* Loop over various lookahead depths */ + for(itop = 0; itop < ntps; + itop++) { /* Loop over various broadcast topologies */ + for(irfa = 0; irfa < nrfs; + irfa++) { /* Loop over various recursive factorizations */ + for(ipfa = 0; ipfa < npfs; + ipfa++) { /* Loop over various panel factorizations */ + for(inbm = 0; inbm < nbms; + inbm++) { /* Loop over various recursive stopping criteria + */ + for(indv = 0; indv < ndvs; + indv++) { /* Loop over various # of panels in recursion */ + /* + * Set up the algorithm parameters + */ + algo.btopo = topval[itop]; + algo.depth = ndhval[indh]; + algo.nbmin = nbmval[inbm]; + algo.nbdiv = ndvval[indv]; + + algo.pfact = rpfa = pfaval[ipfa]; + + if(L1notran != 0) { + if(rpfa == HPL_LEFT_LOOKING) + algo.pffun = HPL_pdpanllN; + else if(rpfa == HPL_CROUT) + algo.pffun = HPL_pdpancrN; + else + algo.pffun = HPL_pdpanrlN; + + algo.rfact = rpfa = rfaval[irfa]; + if(rpfa == HPL_LEFT_LOOKING) + algo.rffun = HPL_pdrpanllN; + else if(rpfa == HPL_CROUT) + algo.rffun = HPL_pdrpancrN; + else + algo.rffun = HPL_pdrpanrlN; + + algo.upfun = HPL_pdupdateNT; + } else { + if(rpfa == HPL_LEFT_LOOKING) + algo.pffun = HPL_pdpanllT; + else if(rpfa == HPL_CROUT) + algo.pffun = HPL_pdpancrT; + else + algo.pffun = HPL_pdpanrlT; + + algo.rfact = rpfa = rfaval[irfa]; + if(rpfa == HPL_LEFT_LOOKING) + algo.rffun = HPL_pdrpanllT; + else if(rpfa == HPL_CROUT) + algo.rffun = HPL_pdrpancrT; + else + algo.rffun = HPL_pdrpanrlT; + + algo.upfun = HPL_pdupdateTT; + } + + algo.fswap = fswap; + algo.fsthr = tswap; + algo.equil = equil; + algo.align = align; + + algo.frac = frac; + + HPL_pdtest(&test, &grid, &algo, nval[in], nbval[inb]); + } + } + } + } + } + } + } + } + (void)HPL_grid_exit(&grid); + HPL_FreeGPU(); + + label_end_of_npqs:; + } + /* + * Print ending messages, close output file, exit. 
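+ *
+ * A representative summary block is sketched below; the counts are
+ * illustrative only:
+ *
+ * Finished 4 tests with the following results:
+ * 4 tests completed and passed residual checks,
+ * 0 tests completed and failed residual checks,
+ * 0 tests skipped because of illegal input values.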
+ */ + if(rank == 0) { + test.ktest = test.kpass + test.kfail + test.kskip; +#ifndef HPL_DETAILED_TIMING + HPL_fprintf(test.outfp, + "%s%s\n", + "========================================", + "========================================"); +#else + if(test.thrsh > HPL_rzero) + HPL_fprintf(test.outfp, + "%s%s\n", + "========================================", + "========================================"); +#endif + + HPL_fprintf(test.outfp, + "\n%s %6d %s\n", + "Finished", + test.ktest, + "tests with the following results:"); + if(test.thrsh > HPL_rzero) { + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kpass, + "tests completed and passed residual checks,"); + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kfail, + "tests completed and failed residual checks,"); + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kskip, + "tests skipped because of illegal input values."); + } else { + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kpass, + "tests completed without checking,"); + HPL_fprintf(test.outfp, + " %6d %s\n", + test.kskip, + "tests skipped because of illegal input values."); + } + + HPL_fprintf(test.outfp, + "%s%s\n", + "----------------------------------------", + "----------------------------------------"); + HPL_fprintf(test.outfp, "\nEnd of Tests.\n"); + HPL_fprintf(test.outfp, + "%s%s\n", + "========================================", + "========================================"); + + if((test.outfp != stdout) && (test.outfp != stderr)) + (void)fclose(test.outfp); + } + + MPI_Finalize(); + + return (0); +} diff --git a/src/HPL_pdinfo.cpp b/src/HPL_pdinfo.cpp new file mode 100644 index 0000000..f04e79e --- /dev/null +++ b/src/HPL_pdinfo.cpp @@ -0,0 +1,1557 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include +#include +#include + +void HPL_pdinfo(int ARGC, + char** ARGV, + HPL_T_test* TEST, + int* NS, + int* N, + int* NBS, + int* NB, + HPL_T_ORDER* PMAPPIN, + int* NPQS, + int* P, + int* Q, + int* p, + int* q, + int* NPFS, + HPL_T_FACT* PF, + int* NBMS, + int* NBM, + int* NDVS, + int* NDV, + int* NRFS, + HPL_T_FACT* RF, + int* NTPS, + HPL_T_TOP* TP, + int* NDHS, + int* DH, + HPL_T_SWAP* FSWAP, + int* TSWAP, + int* L1NOTRAN, + int* UNOTRAN, + int* EQUIL, + int* ALIGN, + double* FRAC) { + /* + * Purpose + * ======= + * + * HPL_pdinfo reads the startup information for the various tests and + * transmits it to all processes. + * + * Arguments + * ========= + * + * TEST (global output) HPL_T_test * + * On entry, TEST points to a testing data structure. On exit, + * the fields of this data structure are initialized as follows: + * TEST->outfp specifies the output file where the results will + * be printed. It is only defined and used by the process 0 of + * the grid. TEST->thrsh specifies the threshhold value for the + * test ratio. TEST->epsil is the relative machine precision of + * the distributed computer. Finally the test counters, kfail, + * kpass, kskip, ktest are initialized to zero. 
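+ *
+ * As an illustration (values hypothetical), these parameters can be
+ * supplied in two ways:
+ * rochpl -P 2 -Q 4 -N 90112 -NB 384 (command line)
+ * rochpl -P 2 -Q 4 -i HPL.dat (input file)
+ * Both paths fill the same output arguments documented below.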
+ * + * NS (global output) int * + * On exit, NS specifies the number of different problem sizes + * to be tested. NS is less than or equal to HPL_MAX_PARAM. + * + * N (global output) int * + * On entry, N is an array of dimension HPL_MAX_PARAM. On exit, + * the first NS entries of this array contain the problem sizes + * to run the code with. + * + * NBS (global output) int * + * On exit, NBS specifies the number of different distribution + * blocking factors to be tested. NBS must be less than or equal + * to HPL_MAX_PARAM. + * + * NB (global output) int * + * On exit, PMAPPIN specifies the process mapping onto the no- + * des of the MPI machine configuration. PMAPPIN defaults to + * row-major ordering. + * + * PMAPPIN (global output) HPL_T_ORDER * + * On entry, NB is an array of dimension HPL_MAX_PARAM. On exit, + * the first NBS entries of this array contain the values of the + * various distribution blocking factors, to run the code with. + * + * NPQS (global output) int * + * On exit, NPQS specifies the number of different values that + * can be used for P and Q, i.e., the number of process grids to + * run the code with. NPQS must be less than or equal to + * HPL_MAX_PARAM. + * + * P (global output) int * + * On entry, P is an array of dimension HPL_MAX_PARAM. On exit, + * the first NPQS entries of this array contain the values of P, + * the number of process rows of the NPQS grids to run the code + * with. + * + * Q (global output) int * + * On entry, Q is an array of dimension HPL_MAX_PARAM. On exit, + * the first NPQS entries of this array contain the values of Q, + * the number of process columns of the NPQS grids to run the + * code with. + * + * p (global output) int * + * On exit, p specifies the number of rows in the node-local MPI + * grid + * + * q (global output) int * + * On exit, q specifies the number of columns in the node-local + * MPI grid + * + * NPFS (global output) int * + * On exit, NPFS specifies the number of different values that + * can be used for PF : the panel factorization algorithm to run + * the code with. NPFS is less than or equal to HPL_MAX_PARAM. + * + * PF (global output) HPL_T_FACT * + * On entry, PF is an array of dimension HPL_MAX_PARAM. On exit, + * the first NPFS entries of this array contain the various + * panel factorization algorithms to run the code with. + * + * NBMS (global output) int * + * On exit, NBMS specifies the number of various recursive + * stopping criteria to be tested. NBMS must be less than or + * equal to HPL_MAX_PARAM. + * + * NBM (global output) int * + * On entry, NBM is an array of dimension HPL_MAX_PARAM. On + * exit, the first NBMS entries of this array contain the values + * of the various recursive stopping criteria to be tested. + * + * NDVS (global output) int * + * On exit, NDVS specifies the number of various numbers of + * panels in recursion to be tested. NDVS is less than or equal + * to HPL_MAX_PARAM. + * + * NDV (global output) int * + * On entry, NDV is an array of dimension HPL_MAX_PARAM. On + * exit, the first NDVS entries of this array contain the values + * of the various numbers of panels in recursion to be tested. + * + * NRFS (global output) int * + * On exit, NRFS specifies the number of different values that + * can be used for RF : the recursive factorization algorithm to + * be tested. NRFS is less than or equal to HPL_MAX_PARAM. + * + * RF (global output) HPL_T_FACT * + * On entry, RF is an array of dimension HPL_MAX_PARAM. 
On exit, + * the first NRFS entries of this array contain the various + * recursive factorization algorithms to run the code with. + * + * NTPS (global output) int * + * On exit, NTPS specifies the number of different values that + * can be used for the broadcast topologies to be tested. NTPS + * is less than or equal to HPL_MAX_PARAM. + * + * TP (global output) HPL_T_TOP * + * On entry, TP is an array of dimension HPL_MAX_PARAM. On exit, + * the first NTPS entries of this array contain the various + * broadcast (along rows) topologies to run the code with. + * + * NDHS (global output) int * + * On exit, NDHS specifies the number of different values that + * can be used for the lookahead depths to be tested. NDHS is + * less than or equal to HPL_MAX_PARAM. + * + * DH (global output) int * + * On entry, DH is an array of dimension HPL_MAX_PARAM. On + * exit, the first NDHS entries of this array contain the values + * of lookahead depths to run the code with. Such a value is at + * least 0 (no-lookahead) or greater than zero. + * + * FSWAP (global output) HPL_T_SWAP * + * On exit, FSWAP specifies the swapping algorithm to be used in + * all tests. + * + * TSWAP (global output) int * + * On exit, TSWAP specifies the swapping threshold as a number + * of columns when the mixed swapping algorithm was chosen. + * + * L1NOTRA (global output) int * + * On exit, L1NOTRAN specifies whether the upper triangle of the + * panels of columns should be stored in no-transposed form + * (L1NOTRAN=1) or in transposed form (L1NOTRAN=0). + * + * UNOTRAN (global output) int * + * On exit, UNOTRAN specifies whether the panels of rows should + * be stored in no-transposed form (UNOTRAN=1) or transposed + * form (UNOTRAN=0) during their broadcast. + * + * EQUIL (global output) int * + * On exit, EQUIL specifies whether equilibration during the + * swap-broadcast of the panel of rows should be performed + * (EQUIL=1) or not (EQUIL=0). + * + * ALIGN (global output) int * + * On exit, ALIGN specifies the alignment of the dynamically + * allocated buffers in double precision words. ALIGN is greater + * than zero. + * + * FRAC (global output) double * + * On exit, FRAC specifies the percentage in which to split the + * the trailing update. + * + * --------------------------------------------------------------------- + */ + + char file[HPL_LINE_MAX], line[HPL_LINE_MAX], auth[HPL_LINE_MAX], + num[HPL_LINE_MAX]; + FILE* infp; + int* iwork = NULL; + char* lineptr; + int error = 0, fid, i, j, lwork, maxp, nprocs, rank, size; + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + /* + * Initialize the TEST data structure with default values + */ + TEST->outfp = stderr; + TEST->epsil = 2.0e-16; + TEST->thrsh = 16.0; + TEST->kfail = TEST->kpass = TEST->kskip = TEST->ktest = 0; + + // parse settings + int _P = 1, _Q = 1, n = 45312, nb = 384; + int _p = -1, _q = -1; + bool cmdlinerun = false; + bool inputfile = false; + double frac = 0.6; + std::string inputFileName = "HPL.dat"; + + for(int i = 1; i < ARGC; i++) { + if(strcmp(ARGV[i], "-h") == 0 || strcmp(ARGV[i], "--help") == 0) { + if(rank == 0) { + std::cout + << "rocHPL client command line options: " + " \n" + "-P [ --ranksP ] arg (=1) Specific MPI grid " + "size: the number of \n" + " rows in MPI grid. " + " \n" + "-Q [ --ranksQ ] arg (=1) Specific MPI grid " + "size: the number of \n" + " columns in MPI grid. " + " \n" + "-N [ --sizeN ] arg (=45312) Specific matrix size: " + "the number of rows \n" + " /columns in global " + "matrix. 
\n" + "-NB [ --sizeNB ] arg (=384) Specific panel size: " + "the number of rows \n" + " /columns in panels. " + " \n" + "-f [ --frac ] arg (=0.6) Specific update split: " + "the percentage to \n" + " split the trailing " + "submatrix. \n" + "-i [ --input ] arg (=HPL.dat) Input file. When set, " + "all other commnand \n" + " line parameters are " + "ignored, and problem \n" + " parameters are read " + "from input file. \n" + "-h [ --help ] Produces this help " + "message \n" + "--version Prints the version " + "number \n"; + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); + exit(0); + } + + if(strcmp(ARGV[i], "--version") == 0) { + if(rank == 0) { + std::cout << "rocHPL version: " << __ROCHPL_VER_MAJOR << "." + << __ROCHPL_VER_MINOR << "." << __ROCHPL_VER_PATCH + << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); + exit(0); + } + + if(strcmp(ARGV[i], "-P") == 0 || strcmp(ARGV[i], "--ranksP") == 0) { + _P = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(_P < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for P. Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-Q") == 0 || strcmp(ARGV[i], "--ranksQ") == 0) { + _Q = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(_Q < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for Q. Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-p") == 0) { + _p = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + } + if(strcmp(ARGV[i], "-q") == 0) { + _q = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + } + + if(strcmp(ARGV[i], "-N") == 0 || strcmp(ARGV[i], "--sizeN") == 0) { + n = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(n < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for N. Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-NB") == 0 || strcmp(ARGV[i], "--sizeNB") == 0) { + nb = atoi(ARGV[i + 1]); + cmdlinerun = true; + i++; + if(nb < 1) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal value for NB. 
Exiting ..."); + MPI_Finalize(); + exit(1); + } + } + if(strcmp(ARGV[i], "-f") == 0 || strcmp(ARGV[i], "--frac") == 0) { + frac = atof(ARGV[i + 1]); + i++; + } + if(strcmp(ARGV[i], "-i") == 0 || strcmp(ARGV[i], "--input") == 0) { + inputFileName = ARGV[i + 1]; + inputfile = true; + i++; + } + } + + /* + * Check for enough processes in machine configuration + */ + maxp = _P * _Q; + if(maxp > size) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Need at least %d processes for these tests", + maxp); + MPI_Finalize(); + exit(1); + } + + /* + * Split fraction + */ + *FRAC = frac; + + /*Node-local grid*/ + MPI_Comm nodeComm; + MPI_Comm_split_type( + MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &nodeComm); + + int localRank; + int localSize; + MPI_Comm_rank(nodeComm, &localRank); + MPI_Comm_size(nodeComm, &localSize); + + if(_p < 1 && _q < 1) { // Neither p nor q specified + _q = localSize; // Assume a 1xq node-local grid + _p = 1; + } else if(_p < 1) { // q specified + if(localSize % _q != 0) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Node-local MPI grid cannot be split into q=%d columns", + _q); + MPI_Finalize(); + exit(1); + } + _p = localSize / _q; + } else if(_q < 1) { // p specified + if(localSize % _p != 0) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Node-local MPI grid cannot be split into p=%d rows", + _p); + MPI_Finalize(); + exit(1); + } + _q = localSize / _p; + } else { + if(localSize != _p * _q) { + if(rank == 0) + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Invalid Node-local MPI grid"); + MPI_Finalize(); + exit(1); + } + } + + /*Check grid can be distributed to nodes*/ + if(_Q % _q != 0 || _P % _p != 0) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "MPI grid is not uniformly distributed amoung nodes, " + "(P,Q)=(%d,%d) and (p,q)=(%d,%d)", + _P, + _Q, + _p, + _q); + MPI_Finalize(); + exit(1); + } + MPI_Comm_free(&nodeComm); + /* + * Node-local Process grids, mapping + */ + *p = _p; + *q = _q; + + if(inputfile == false && cmdlinerun == true) { + // We were given run paramters via the cmd line so skip + // trying to read from an input file and just fill a + // TEST structure. 
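+ // Worked example (assuming 16 ranks as two 8-rank nodes): -P 4 -Q 4
+ // with -q 4 gives p = localSize / q = 8 / 4 = 2, and the checks
+ // above pass since P % p == 0 and Q % q == 0.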
+ + /* + * Problem size (>=0) (N) + */ + *NS = 1; + N[0] = n; + /* + * Block size (>=1) (NB) + */ + *NBS = 1; + NB[0] = nb; + /* + * Process grids, mapping, (>=1) (P, Q) + */ + *PMAPPIN = HPL_COLUMN_MAJOR; + *NPQS = 1; + P[0] = _P; + Q[0] = _Q; + /* + * Panel factorization algorithm (PF) + */ + *NPFS = 1; + PF[0] = HPL_RIGHT_LOOKING; // HPL_LEFT_LOOKING, HPL_CROUT; + /* + * Recursive stopping criterium (>=1) (NBM) + */ + *NBMS = 1; + NBM[0] = 16; + /* + * Number of panels in recursion (>=2) (NDV) + */ + *NDVS = 1; + NDV[0] = 2; + /* + * Recursive panel factorization (RF) + */ + *NRFS = 1; + RF[0] = HPL_RIGHT_LOOKING; // HPL_LEFT_LOOKING, HPL_CROUT; + /* + * Broadcast topology (TP) (0=rg, 1=2rg, 2=rgM, 3=2rgM, 4=L) + */ + *NTPS = 1; + TP[0] = HPL_1RING; + /* + * Lookahead depth (>=0) (NDH) + */ + *NDHS = 1; + DH[0] = 1; + /* + * Swapping algorithm (0,1 or 2) (FSWAP) + */ + *FSWAP = HPL_SWAP01; + /* + * Swapping threshold (>=0) (TSWAP) + */ + *TSWAP = 64; + /* + * L1 in (no-)transposed form (0 or 1) + */ + *L1NOTRAN = 1; + /* + * U in (no-)transposed form (0 or 1) + */ + *UNOTRAN = 0; + /* + * Equilibration (0=no, 1=yes) + */ + *EQUIL = 0; + /* + * Memory alignment in bytes (> 0) (ALIGN) + */ + *ALIGN = 8; + + /* + * Compute and broadcast machine epsilon + */ + TEST->epsil = HPL_pdlamch(MPI_COMM_WORLD, HPL_MACH_EPS); + + if(rank == 0) { + if((TEST->outfp = fopen("HPL.out", "w")) == NULL) { error = 1; } + } + (void)HPL_all_reduce((void*)(&error), 1, HPL_INT, HPL_MAX, MPI_COMM_WORLD); + if(error) { + if(rank == 0) + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "cannot open file HPL.out."); + MPI_Finalize(); + exit(1); + } + } else { + /* + * Process 0 reads the input data, broadcasts to other processes and + * writes needed information to TEST->outfp. + */ + char* status; + if(rank == 0) { + /* + * Open file and skip data file header + */ + if((infp = fopen(inputFileName.c_str(), "r")) == NULL) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "cannot open file %s", + inputFileName.c_str()); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + status = fgets(auth, HPL_LINE_MAX - 2, infp); + /* + * Read name and unit number for summary output file + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", file); + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + fid = atoi(num); + if(fid == 6) + TEST->outfp = stdout; + else if(fid == 7) + TEST->outfp = stderr; + else if((TEST->outfp = fopen(file, "w")) == NULL) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "cannot open file %s.", file); + error = 1; + goto label_error; + } + /* + * Read and check the parameter values for the tests. 
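+ * Each value below is read as the first whitespace-delimited token on
+ * its line (fgets followed by sscanf "%s"); the rest of the line is
+ * ignored, which is why entries such as
+ * "6 device out (6=stdout,7=stderr,file)" parse cleanly.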
+ * + * Problem size (>=0) (N) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NS = atoi(num); + if((*NS < 1) || (*NS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %d", + "Number of values of N is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((N[i] = atoi(num)) < 0) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of N less than 0"); + error = 1; + goto label_error; + } + } + /* + * Block size (>=1) (NB) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NBS = atoi(num); + if((*NBS < 1) || (*NBS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of NB is less than 1 or", + "greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NBS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((NB[i] = atoi(num)) < 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of NB less than 1"); + error = 1; + goto label_error; + } + } + /* + * Process grids, mapping, (>=1) (P, Q) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *PMAPPIN = (atoi(num) == 1 ? HPL_COLUMN_MAJOR : HPL_ROW_MAJOR); + + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NPQS = atoi(num); + if((*NPQS < 1) || (*NPQS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of grids is less", + "than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NPQS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((P[i] = atoi(num)) < 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of P less than 1"); + error = 1; + goto label_error; + } + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NPQS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((Q[i] = atoi(num)) < 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of Q less than 1"); + error = 1; + goto label_error; + } + } + /* + * Check for enough processes in machine configuration + */ + maxp = 0; + for(i = 0; i < *NPQS; i++) { + nprocs = P[i] * Q[i]; + maxp = Mmax(maxp, nprocs); + } + if(maxp > size) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Need at least %d processes for these tests", + maxp); + error = 1; + goto label_error; + } + /* + * Checking threshold value (TEST->thrsh) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + TEST->thrsh = atof(num); + /* + * Panel factorization algorithm (PF) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NPFS = atoi(num); + if((*NPFS < 1) || (*NPFS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "number of values of PFACT", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NPFS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + j = atoi(num); + if(j == 0) + PF[i] = 
HPL_LEFT_LOOKING; + else if(j == 1) + PF[i] = HPL_CROUT; + else if(j == 2) + PF[i] = HPL_RIGHT_LOOKING; + else + PF[i] = HPL_RIGHT_LOOKING; + } + /* + * Recursive stopping criterium (>=1) (NBM) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NBMS = atoi(num); + if((*NBMS < 1) || (*NBMS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of NBMIN", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NBMS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((NBM[i] = atoi(num)) < 1) { + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Value of NBMIN less than 1"); + error = 1; + goto label_error; + } + } + /* + * Number of panels in recursion (>=2) (NDV) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NDVS = atoi(num); + if((*NDVS < 1) || (*NDVS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of NDIV", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NDVS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((NDV[i] = atoi(num)) < 2) { + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Value of NDIV less than 2"); + error = 1; + goto label_error; + } + } + /* + * Recursive panel factorization (RF) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NRFS = atoi(num); + if((*NRFS < 1) || (*NRFS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of RFACT", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NRFS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + j = atoi(num); + if(j == 0) + RF[i] = HPL_LEFT_LOOKING; + else if(j == 1) + RF[i] = HPL_CROUT; + else if(j == 2) + RF[i] = HPL_RIGHT_LOOKING; + else + RF[i] = HPL_RIGHT_LOOKING; + } + /* + * Broadcast topology (TP) (0=rg, 1=2rg, 2=rgM, 3=2rgM, 4=L) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NTPS = atoi(num); + if((*NTPS < 1) || (*NTPS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of BCAST", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i = 0; i < *NTPS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + j = atoi(num); + if(j == 0) + TP[i] = HPL_1RING; + else if(j == 1) + TP[i] = HPL_1RING_M; + else if(j == 2) + TP[i] = HPL_2RING; + else if(j == 3) + TP[i] = HPL_2RING_M; + else if(j == 4) + TP[i] = HPL_BLONG; + else // if(j == 5) + TP[i] = HPL_BLONG_M; + } + /* + * Lookahead depth (>=0) (NDH) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *NDHS = atoi(num); + if((*NDHS < 1) || (*NDHS > HPL_MAX_PARAM)) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "%s %s %d", + "Number of values of DEPTH", + "is less than 1 or greater than", + HPL_MAX_PARAM); + error = 1; + goto label_error; + } + status = fgets(line, HPL_LINE_MAX - 2, infp); + lineptr = line; + for(i 
= 0; i < *NDHS; i++) { + (void)sscanf(lineptr, "%s", num); + lineptr += strlen(num) + 1; + if((DH[i] = atoi(num)) < 0) { + HPL_pwarn( + stderr, __LINE__, "HPL_pdinfo", "Value of DEPTH less than 0"); + error = 1; + goto label_error; + } + // NC: We require lookahead depth of 1 + if(DH[i] != 1) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of DEPTH must be 1"); + error = 1; + goto label_error; + } + } + /* + * Swapping algorithm (0,1 or 2) (FSWAP) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + j = atoi(num); + if(j == 0) + *FSWAP = HPL_SWAP00; + else if(j == 1) + *FSWAP = HPL_SWAP01; + else if(j == 2) + *FSWAP = HPL_SW_MIX; + else + *FSWAP = HPL_SWAP01; + // NC: Only one rowswapping algorithm implemented + if(*FSWAP != HPL_SWAP01) { + HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Value of SWAP must be 1"); + error = 1; + goto label_error; + } + /* + * Swapping threshold (>=0) (TSWAP) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *TSWAP = atoi(num); + if(*TSWAP <= 0) *TSWAP = 0; + /* + * L1 in (no-)transposed form (0 or 1) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *L1NOTRAN = atoi(num); + if((*L1NOTRAN != 0) && (*L1NOTRAN != 1)) *L1NOTRAN = 0; + /* + * U in (no-)transposed form (0 or 1) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *UNOTRAN = atoi(num); + if((*UNOTRAN != 0) && (*UNOTRAN != 1)) *UNOTRAN = 0; + + // NC: We don't support holding U in no-transpose form anymore + if(*UNOTRAN != 0) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "U in no-transposed form unsupported"); + error = 1; + goto label_error; + } + /* + * Equilibration (0=no, 1=yes) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *EQUIL = atoi(num); + if((*EQUIL != 0) && (*EQUIL != 1)) *EQUIL = 1; + + // NC: We don't currently support Equilibration + if(*EQUIL != 0) { + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Equilibration currently unsupported"); + error = 1; + goto label_error; + } + /* + * Memory alignment in bytes (> 0) (ALIGN) + */ + status = fgets(line, HPL_LINE_MAX - 2, infp); + (void)sscanf(line, "%s", num); + *ALIGN = atoi(num); + if(*ALIGN <= 0) *ALIGN = 4; + + /* + * Close input file + */ + label_error: + (void)fclose(infp); + } else { + TEST->outfp = NULL; + } + + /* + * Check for error on reading input file + */ + (void)HPL_all_reduce((void*)(&error), 1, HPL_INT, HPL_MAX, MPI_COMM_WORLD); + if(error) { + if(rank == 0) + HPL_pwarn(stderr, + __LINE__, + "HPL_pdinfo", + "Illegal input in file HPL.dat. Exiting ..."); + MPI_Finalize(); + exit(1); + } + /* + * Compute and broadcast machine epsilon + */ + TEST->epsil = HPL_pdlamch(MPI_COMM_WORLD, HPL_MACH_EPS); + /* + * Pack information arrays and broadcast + */ + (void)HPL_broadcast( + (void*)(&(TEST->thrsh)), 1, HPL_DOUBLE, 0, MPI_COMM_WORLD); + /* + * Broadcast array sizes + */ + iwork = (int*)malloc((size_t)(15) * sizeof(int)); + if(rank == 0) { + iwork[0] = *NS; + iwork[1] = *NBS; + iwork[2] = (*PMAPPIN == HPL_ROW_MAJOR ? 0 : 1); + iwork[3] = *NPQS; + iwork[4] = *NPFS; + iwork[5] = *NBMS; + iwork[6] = *NDVS; + iwork[7] = *NRFS; + iwork[8] = *NTPS; + iwork[9] = *NDHS; + iwork[10] = *TSWAP; + iwork[11] = *L1NOTRAN; + iwork[12] = *UNOTRAN; + iwork[13] = *EQUIL; + iwork[14] = *ALIGN; + } + (void)HPL_broadcast((void*)iwork, 15, HPL_INT, 0, MPI_COMM_WORLD); + if(rank != 0) { + *NS = iwork[0]; + *NBS = iwork[1]; + *PMAPPIN = (iwork[2] == 0 ? 
HPL_ROW_MAJOR : HPL_COLUMN_MAJOR); + *NPQS = iwork[3]; + *NPFS = iwork[4]; + *NBMS = iwork[5]; + *NDVS = iwork[6]; + *NRFS = iwork[7]; + *NTPS = iwork[8]; + *NDHS = iwork[9]; + *TSWAP = iwork[10]; + *L1NOTRAN = iwork[11]; + *UNOTRAN = iwork[12]; + *EQUIL = iwork[13]; + *ALIGN = iwork[14]; + } + if(iwork) free(iwork); + /* + * Pack information arrays and broadcast + */ + lwork = (*NS) + (*NBS) + 2 * (*NPQS) + (*NPFS) + (*NBMS) + (*NDVS) + + (*NRFS) + (*NTPS) + (*NDHS) + 1; + iwork = (int*)malloc((size_t)(lwork) * sizeof(int)); + if(rank == 0) { + j = 0; + for(i = 0; i < *NS; i++) { + iwork[j] = N[i]; + j++; + } + for(i = 0; i < *NBS; i++) { + iwork[j] = NB[i]; + j++; + } + for(i = 0; i < *NPQS; i++) { + iwork[j] = P[i]; + j++; + } + for(i = 0; i < *NPQS; i++) { + iwork[j] = Q[i]; + j++; + } + for(i = 0; i < *NPFS; i++) { + if(PF[i] == HPL_LEFT_LOOKING) + iwork[j] = 0; + else if(PF[i] == HPL_CROUT) + iwork[j] = 1; + else if(PF[i] == HPL_RIGHT_LOOKING) + iwork[j] = 2; + j++; + } + for(i = 0; i < *NBMS; i++) { + iwork[j] = NBM[i]; + j++; + } + for(i = 0; i < *NDVS; i++) { + iwork[j] = NDV[i]; + j++; + } + for(i = 0; i < *NRFS; i++) { + if(RF[i] == HPL_LEFT_LOOKING) + iwork[j] = 0; + else if(RF[i] == HPL_CROUT) + iwork[j] = 1; + else if(RF[i] == HPL_RIGHT_LOOKING) + iwork[j] = 2; + j++; + } + for(i = 0; i < *NTPS; i++) { + if(TP[i] == HPL_1RING) + iwork[j] = 0; + else if(TP[i] == HPL_1RING_M) + iwork[j] = 1; + else if(TP[i] == HPL_2RING) + iwork[j] = 2; + else if(TP[i] == HPL_2RING_M) + iwork[j] = 3; + else if(TP[i] == HPL_BLONG) + iwork[j] = 4; + else if(TP[i] == HPL_BLONG_M) + iwork[j] = 5; + j++; + } + for(i = 0; i < *NDHS; i++) { + iwork[j] = DH[i]; + j++; + } + + if(*FSWAP == HPL_SWAP00) + iwork[j] = 0; + else if(*FSWAP == HPL_SWAP01) + iwork[j] = 1; + else if(*FSWAP == HPL_SW_MIX) + iwork[j] = 2; + j++; + } + (void)HPL_broadcast((void*)iwork, lwork, HPL_INT, 0, MPI_COMM_WORLD); + if(rank != 0) { + j = 0; + for(i = 0; i < *NS; i++) { + N[i] = iwork[j]; + j++; + } + for(i = 0; i < *NBS; i++) { + NB[i] = iwork[j]; + j++; + } + for(i = 0; i < *NPQS; i++) { + P[i] = iwork[j]; + j++; + } + for(i = 0; i < *NPQS; i++) { + Q[i] = iwork[j]; + j++; + } + + for(i = 0; i < *NPFS; i++) { + if(iwork[j] == 0) + PF[i] = HPL_LEFT_LOOKING; + else if(iwork[j] == 1) + PF[i] = HPL_CROUT; + else if(iwork[j] == 2) + PF[i] = HPL_RIGHT_LOOKING; + j++; + } + for(i = 0; i < *NBMS; i++) { + NBM[i] = iwork[j]; + j++; + } + for(i = 0; i < *NDVS; i++) { + NDV[i] = iwork[j]; + j++; + } + for(i = 0; i < *NRFS; i++) { + if(iwork[j] == 0) + RF[i] = HPL_LEFT_LOOKING; + else if(iwork[j] == 1) + RF[i] = HPL_CROUT; + else if(iwork[j] == 2) + RF[i] = HPL_RIGHT_LOOKING; + j++; + } + for(i = 0; i < *NTPS; i++) { + if(iwork[j] == 0) + TP[i] = HPL_1RING; + else if(iwork[j] == 1) + TP[i] = HPL_1RING_M; + else if(iwork[j] == 2) + TP[i] = HPL_2RING; + else if(iwork[j] == 3) + TP[i] = HPL_2RING_M; + else if(iwork[j] == 4) + TP[i] = HPL_BLONG; + else if(iwork[j] == 5) + TP[i] = HPL_BLONG_M; + j++; + } + for(i = 0; i < *NDHS; i++) { + DH[i] = iwork[j]; + j++; + } + + if(iwork[j] == 0) + *FSWAP = HPL_SWAP00; + else if(iwork[j] == 1) + *FSWAP = HPL_SWAP01; + else if(iwork[j] == 2) + *FSWAP = HPL_SW_MIX; + j++; + } + if(iwork) free(iwork); + } + + /* + * regurgitate input + */ + if(rank == 0) { + HPL_fprintf(TEST->outfp, + "%s%s\n", + "========================================", + "========================================"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "HPLinpack 2.2 -- High-Performance Linpack benchmark -- ", + " February 24, 
2016"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "Written by A. Petitet and R. Clint Whaley, ", + "Innovative Computing Laboratory, UTK"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "Modified by Piotr Luszczek, ", + "Innovative Computing Laboratory, UTK"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "Modified by Julien Langou, ", + "University of Colorado Denver"); + HPL_fprintf(TEST->outfp, + "%s%s\n", + "========================================", + "========================================"); + + HPL_fprintf(TEST->outfp, + "\n%s\n", + "An explanation of the input/output parameters follows:"); + HPL_fprintf(TEST->outfp, "%s\n", "T/V : Wall time / encoded variant."); + HPL_fprintf( + TEST->outfp, "%s\n", "N : The order of the coefficient matrix A."); + HPL_fprintf( + TEST->outfp, "%s\n", "NB : The partitioning blocking factor."); + HPL_fprintf(TEST->outfp, "%s\n", "P : The number of process rows."); + HPL_fprintf(TEST->outfp, "%s\n", "Q : The number of process columns."); + HPL_fprintf(TEST->outfp, + "%s\n", + "Time : Time in seconds to solve the linear system."); + HPL_fprintf(TEST->outfp, + "%s\n\n", + "Gflops : Rate of execution for solving the linear system."); + HPL_fprintf( + TEST->outfp, "%s\n", "The following parameter values will be used:"); + /* + * Problem size + */ + HPL_fprintf(TEST->outfp, "\nN :"); + for(i = 0; i < Mmin(8, *NS); i++) HPL_fprintf(TEST->outfp, "%8d ", N[i]); + if(*NS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NS); i++) HPL_fprintf(TEST->outfp, "%8d ", N[i]); + if(*NS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NS; i++) HPL_fprintf(TEST->outfp, "%8d ", N[i]); + } + } + /* + * Distribution blocking factor + */ + HPL_fprintf(TEST->outfp, "\nNB :"); + for(i = 0; i < Mmin(8, *NBS); i++) HPL_fprintf(TEST->outfp, "%8d ", NB[i]); + if(*NBS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NBS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NB[i]); + if(*NBS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NBS; i++) HPL_fprintf(TEST->outfp, "%8d ", NB[i]); + } + } + /* + * Process mapping + */ + HPL_fprintf(TEST->outfp, "\nPMAP :"); + if(*PMAPPIN == HPL_ROW_MAJOR) + HPL_fprintf(TEST->outfp, " Row-major process mapping"); + else if(*PMAPPIN == HPL_COLUMN_MAJOR) + HPL_fprintf(TEST->outfp, " Column-major process mapping"); + /* + * Process grid + */ + HPL_fprintf(TEST->outfp, "\nP :"); + for(i = 0; i < Mmin(8, *NPQS); i++) HPL_fprintf(TEST->outfp, "%8d ", P[i]); + if(*NPQS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NPQS); i++) + HPL_fprintf(TEST->outfp, "%8d ", P[i]); + if(*NPQS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NPQS; i++) HPL_fprintf(TEST->outfp, "%8d ", P[i]); + } + } + HPL_fprintf(TEST->outfp, "\nQ :"); + for(i = 0; i < Mmin(8, *NPQS); i++) HPL_fprintf(TEST->outfp, "%8d ", Q[i]); + if(*NPQS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NPQS); i++) + HPL_fprintf(TEST->outfp, "%8d ", Q[i]); + if(*NPQS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NPQS; i++) HPL_fprintf(TEST->outfp, "%8d ", Q[i]); + } + } + /* + * Panel Factorization + */ + HPL_fprintf(TEST->outfp, "\nPFACT :"); + for(i = 0; i < Mmin(8, *NPFS); i++) { + if(PF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(PF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(PF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NPFS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, 
*NPFS); i++) { + if(PF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(PF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(PF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NPFS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NPFS; i++) { + if(PF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(PF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(PF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + } + } + /* + * Recursive stopping criterium + */ + HPL_fprintf(TEST->outfp, "\nNBMIN :"); + for(i = 0; i < Mmin(8, *NBMS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NBM[i]); + if(*NBMS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NBMS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NBM[i]); + if(*NBMS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NBMS; i++) HPL_fprintf(TEST->outfp, "%8d ", NBM[i]); + } + } + /* + * Number of panels in recursion + */ + HPL_fprintf(TEST->outfp, "\nNDIV :"); + for(i = 0; i < Mmin(8, *NDVS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NDV[i]); + if(*NDVS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NDVS); i++) + HPL_fprintf(TEST->outfp, "%8d ", NDV[i]); + if(*NDVS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NDVS; i++) HPL_fprintf(TEST->outfp, "%8d ", NDV[i]); + } + } + /* + * Recursive Factorization + */ + HPL_fprintf(TEST->outfp, "\nRFACT :"); + for(i = 0; i < Mmin(8, *NRFS); i++) { + if(RF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(RF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(RF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NRFS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NRFS); i++) { + if(RF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(RF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(RF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + if(*NRFS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NRFS; i++) { + if(RF[i] == HPL_LEFT_LOOKING) + HPL_fprintf(TEST->outfp, " Left "); + else if(RF[i] == HPL_CROUT) + HPL_fprintf(TEST->outfp, " Crout "); + else if(RF[i] == HPL_RIGHT_LOOKING) + HPL_fprintf(TEST->outfp, " Right "); + } + } + } + /* + * Broadcast topology + */ + HPL_fprintf(TEST->outfp, "\nBCAST :"); + for(i = 0; i < Mmin(8, *NTPS); i++) { + if(TP[i] == HPL_1RING) + HPL_fprintf(TEST->outfp, " 1ring "); + else if(TP[i] == HPL_1RING_M) + HPL_fprintf(TEST->outfp, " 1ringM "); + else if(TP[i] == HPL_2RING) + HPL_fprintf(TEST->outfp, " 2ring "); + else if(TP[i] == HPL_2RING_M) + HPL_fprintf(TEST->outfp, " 2ringM "); + else if(TP[i] == HPL_BLONG) + HPL_fprintf(TEST->outfp, " Blong "); + else if(TP[i] == HPL_BLONG_M) + HPL_fprintf(TEST->outfp, " BlongM "); + } + if(*NTPS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NTPS); i++) { + if(TP[i] == HPL_1RING) + HPL_fprintf(TEST->outfp, " 1ring "); + else if(TP[i] == HPL_1RING_M) + HPL_fprintf(TEST->outfp, " 1ringM "); + else if(TP[i] == HPL_2RING) + HPL_fprintf(TEST->outfp, " 2ring "); + else if(TP[i] == HPL_2RING_M) + HPL_fprintf(TEST->outfp, " 2ringM "); + else if(TP[i] == HPL_BLONG) + HPL_fprintf(TEST->outfp, " Blong "); + else if(TP[i] == HPL_BLONG_M) + HPL_fprintf(TEST->outfp, " BlongM "); + } + if(*NTPS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NTPS; i++) { + if(TP[i] == HPL_1RING) + 
HPL_fprintf(TEST->outfp, " 1ring "); + else if(TP[i] == HPL_1RING_M) + HPL_fprintf(TEST->outfp, " 1ringM "); + else if(TP[i] == HPL_2RING) + HPL_fprintf(TEST->outfp, " 2ring "); + else if(TP[i] == HPL_2RING_M) + HPL_fprintf(TEST->outfp, " 2ringM "); + else if(TP[i] == HPL_BLONG) + HPL_fprintf(TEST->outfp, " Blong "); + else if(TP[i] == HPL_BLONG_M) + HPL_fprintf(TEST->outfp, " BlongM "); + } + } + } + /* + * Lookahead depths + */ + HPL_fprintf(TEST->outfp, "\nDEPTH :"); + for(i = 0; i < Mmin(8, *NDHS); i++) HPL_fprintf(TEST->outfp, "%8d ", DH[i]); + if(*NDHS > 8) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 8; i < Mmin(16, *NDHS); i++) + HPL_fprintf(TEST->outfp, "%8d ", DH[i]); + if(*NDHS > 16) { + HPL_fprintf(TEST->outfp, "\n "); + for(i = 16; i < *NDHS; i++) HPL_fprintf(TEST->outfp, "%8d ", DH[i]); + } + } + /* + * Swapping algorithm + */ + HPL_fprintf(TEST->outfp, "\nSWAP :"); + if(*FSWAP == HPL_SWAP00) + HPL_fprintf(TEST->outfp, " Binary-exchange"); + else if(*FSWAP == HPL_SWAP01) + HPL_fprintf(TEST->outfp, " Spread-roll (long)"); + else if(*FSWAP == HPL_SW_MIX) + HPL_fprintf(TEST->outfp, " Mix (threshold = %d)", *TSWAP); + /* + * L1 storage form + */ + HPL_fprintf(TEST->outfp, "\nL1 :"); + if(*L1NOTRAN != 0) + HPL_fprintf(TEST->outfp, " no-transposed form"); + else + HPL_fprintf(TEST->outfp, " transposed form"); + /* + * U storage form + */ + HPL_fprintf(TEST->outfp, "\nU :"); + if(*UNOTRAN != 0) + HPL_fprintf(TEST->outfp, " no-transposed form"); + else + HPL_fprintf(TEST->outfp, " transposed form"); + /* + * Equilibration + */ + HPL_fprintf(TEST->outfp, "\nEQUIL :"); + if(*EQUIL != 0) + HPL_fprintf(TEST->outfp, " yes"); + else + HPL_fprintf(TEST->outfp, " no"); + /* + * Alignment + */ + HPL_fprintf(TEST->outfp, "\nALIGN : %d double precision words", *ALIGN); + + HPL_fprintf(TEST->outfp, "\n\n"); + /* + * For testing only + */ + if(TEST->thrsh > HPL_rzero) { + HPL_fprintf(TEST->outfp, + "%s%s\n\n", + "----------------------------------------", + "----------------------------------------"); + HPL_fprintf(TEST->outfp, + "%s\n", + "- The matrix A is randomly generated for each test."); + HPL_fprintf(TEST->outfp, + "%s\n", + "- The following scaled residual check will be computed:"); + HPL_fprintf(TEST->outfp, + "%s\n", + " ||Ax-b||_oo / ( eps * ( || x ||_oo * || A ||_oo + || " + "b ||_oo ) * N )"); + HPL_fprintf(TEST->outfp, + "%s %21.6e\n", + "- The relative machine precision (eps) is taken to be ", + TEST->epsil); + HPL_fprintf( + TEST->outfp, + "%s %11.1f\n\n", + "- Computational tests pass if scaled residuals are less than ", + TEST->thrsh); + } + } +} diff --git a/src/HPL_pdtest.cpp b/src/HPL_pdtest.cpp new file mode 100644 index 0000000..afdda39 --- /dev/null +++ b/src/HPL_pdtest.cpp @@ -0,0 +1,501 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include +#include "hpl.hpp" + +void HPL_pdtest(HPL_T_test* TEST, + HPL_T_grid* GRID, + HPL_T_palg* ALGO, + const int N, + const int NB) { +/* + * Purpose + * ======= + * + * HPL_pdtest performs one test given a set of parameters such as the + * process grid, the problem size, the distribution blocking factor ... + * This function generates the data, calls and times the linear system + * solver, checks the accuracy of the obtained vector solution and + * writes this information to the file pointed to by TEST->outfp. + * + * Arguments + * ========= + * + * TEST (global input) HPL_T_test * + * On entry, TEST points to a testing data structure: outfp + * specifies the output file where the results will be printed. + * It is only defined and used by the process 0 of the grid. + * thrsh specifies the threshhold value for the test ratio. + * Concretely, a test is declared "PASSED" if and only if the + * following inequality is satisfied: + * ||Ax-b||_oo / ( epsil * + * ( || x ||_oo * || A ||_oo + || b ||_oo ) * + * N ) < thrsh. + * epsil is the relative machine precision of the distributed + * computer. Finally the test counters, kfail, kpass, kskip and + * ktest are updated as follows: if the test passes, kpass is + * incremented by one; if the test fails, kfail is incremented + * by one; if the test is skipped, kskip is incremented by one. + * ktest is left unchanged. + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * ALGO (global input) HPL_T_palg * + * On entry, ALGO points to the data structure containing the + * algorithmic parameters to be used for this test. + * + * N (global input) const int + * On entry, N specifies the order of the coefficient matrix A. + * N must be at least zero. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * --------------------------------------------------------------------- + */ +/* + * .. Local Variables .. + */ +#ifdef HPL_DETAILED_TIMING + double HPL_w[HPL_TIMING_N]; +#endif + HPL_T_pmat mat; + double wtime[1]; + int ierr; + double Anorm1, AnormI, Gflops, Xnorm1, XnormI, BnormI, resid0, resid1; + double* Bptr; + double* dBptr; + static int first = 1; + int ii, ip2, mycol, myrow, npcol, nprow, nq; + char ctop, cpfact, crfact; + time_t current_time_start, current_time_end; + + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + + /* + * Allocate matrix, right-hand-side, and vector solution x. [ A | b ] is + * N by N+1. One column is added in every process column for the solve. + * The result however is stored in a 1 x N vector replicated in every + * process row. In every process, A is lda * (nq+1), x is 1 * nq and the + * workspace is mp. + */ + ierr = HPL_pdmatgen(TEST, GRID, ALGO, &mat, N, NB); + + if(ierr != HPL_SUCCESS) { + (TEST->kskip)++; + HPL_pdmatfree(&mat); + return; + } + + /* Create row-swapping data type */ + MPI_Type_contiguous(NB + 4, MPI_DOUBLE, &PDFACT_ROW); + MPI_Type_commit(&PDFACT_ROW); + + /* + * generate matrix and right-hand-side, [ A | b ] which is N by N+1. 
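+ *
+ * As a concrete illustration (assuming N = 45312, NB = 384 on a
+ * 2 x 2 process grid): each process holds mp = nq = 45312 / 2 = 22656
+ * rows and columns of [ A | b ], plus the one extra local column per
+ * process column allocated above for the solve.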
+ */
+ HPL_pdrandmat(GRID, N, N + 1, NB, mat.dA, mat.ld, HPL_ISEED);
+
+ /*
+ * Solve linear system
+ */
+ HPL_ptimer_boot();
+ (void)HPL_barrier(GRID->all_comm);
+ time(&current_time_start);
+ HPL_ptimer(0);
+ HPL_pdgesv(GRID, ALGO, &mat);
+ HPL_ptimer(0);
+ time(&current_time_end);
+
+ /*
+ * Gather max of all CPU and WALL clock timings and print timing results
+ */
+ HPL_ptimer_combine(
+ GRID->all_comm, HPL_AMAX_PTIME, HPL_WALL_PTIME, 1, 0, wtime);
+
+ if((myrow == 0) && (mycol == 0)) {
+ if(first) {
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "========================================",
+ "========================================");
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "T/V N NB P Q",
+ " Time Gflops");
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "----------------------------------------",
+ "----------------------------------------");
+ if(TEST->thrsh <= HPL_rzero) first = 0;
+ }
+ /*
+ * 2/3 N^3 - 1/2 N^2 flops for LU factorization + 2 N^2 flops for solve.
+ * Print WALL time
+ */
+ Gflops = (((double)(N) / 1.0e+9) * ((double)(N) / wtime[0])) *
+ ((2.0 / 3.0) * (double)(N) + (3.0 / 2.0));
+
+ cpfact = (((HPL_T_FACT)(ALGO->pfact) == (HPL_T_FACT)(HPL_LEFT_LOOKING))
+ ? (char)('L')
+ : (((HPL_T_FACT)(ALGO->pfact) == (HPL_T_FACT)(HPL_CROUT))
+ ? (char)('C')
+ : (char)('R')));
+ crfact = (((HPL_T_FACT)(ALGO->rfact) == (HPL_T_FACT)(HPL_LEFT_LOOKING))
+ ? (char)('L')
+ : (((HPL_T_FACT)(ALGO->rfact) == (HPL_T_FACT)(HPL_CROUT))
+ ? (char)('C')
+ : (char)('R')));
+
+ if(ALGO->btopo == HPL_1RING)
+ ctop = '0';
+ else if(ALGO->btopo == HPL_1RING_M)
+ ctop = '1';
+ else if(ALGO->btopo == HPL_2RING)
+ ctop = '2';
+ else if(ALGO->btopo == HPL_2RING_M)
+ ctop = '3';
+ else if(ALGO->btopo == HPL_BLONG)
+ ctop = '4';
+ else /* if( ALGO->btopo == HPL_BLONG_M ) */
+ ctop = '5';
+
+ if(wtime[0] > HPL_rzero) {
+ HPL_fprintf(TEST->outfp,
+ "W%c%1d%c%c%1d%c%1d%12d %5d %5d %5d %18.2f %18.3e\n",
+ (GRID->order == HPL_ROW_MAJOR ? 'R' : 'C'),
+ ALGO->depth,
+ ctop,
+ crfact,
+ ALGO->nbdiv,
+ cpfact,
+ ALGO->nbmin,
+ N,
+ NB,
+ nprow,
+ npcol,
+ wtime[0],
+ Gflops);
+ HPL_fprintf(TEST->outfp,
+ "HPL_pdgesv() start time %s\n",
+ ctime(&current_time_start));
+ HPL_fprintf(TEST->outfp,
+ "HPL_pdgesv() end time %s\n",
+ ctime(&current_time_end));
+ }
+#ifdef HPL_PROGRESS_REPORT
+ printf("Final Score: %7.4e GFLOPS \n", Gflops);
+#endif
+ }
+#ifdef HPL_DETAILED_TIMING
+ HPL_ptimer_combine(GRID->all_comm,
+ HPL_AMAX_PTIME,
+ HPL_WALL_PTIME,
+ HPL_TIMING_N,
+ HPL_TIMING_BEG,
+ HPL_w);
+ if((myrow == 0) && (mycol == 0)) {
+ HPL_fprintf(TEST->outfp,
+ "%s%s\n",
+ "--VVV--VVV--VVV--VVV--VVV--VVV--VVV--V",
+ "VV--VVV--VVV--VVV--VVV--VVV--VVV--VVV-");
+ /*
+ * Lbcast
+ */
+ if(HPL_w[HPL_TIMING_LBCAST - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "Max aggregated wall time bcast . . . : %18.2f\n",
+ HPL_w[HPL_TIMING_LBCAST - HPL_TIMING_BEG]);
+ /*
+ * Panel copy
+ */
+ if(HPL_w[HPL_TIMING_COPY - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "+ Max aggregated wall time panel copy: %18.2f\n",
+ HPL_w[HPL_TIMING_COPY - HPL_TIMING_BEG]);
+ /*
+ * Recursive panel factorization
+ */
+ if(HPL_w[HPL_TIMING_RPFACT - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "+ Max aggregated wall time rfact . . : %18.2f\n",
+ HPL_w[HPL_TIMING_RPFACT - HPL_TIMING_BEG]);
+ /*
+ * Panel factorization
+ */
+ if(HPL_w[HPL_TIMING_PFACT - HPL_TIMING_BEG] > HPL_rzero)
+ HPL_fprintf(TEST->outfp,
+ "+ + Max aggregated wall time pfact . 
: %18.2f\n", + HPL_w[HPL_TIMING_PFACT - HPL_TIMING_BEG]); + /* + * Panel factorization (swap) + */ + if(HPL_w[HPL_TIMING_MXSWP - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "+ + Max aggregated wall time mxswp . : %18.2f\n", + HPL_w[HPL_TIMING_MXSWP - HPL_TIMING_BEG]); + /* + * Update (swap) + */ + if(HPL_w[HPL_TIMING_LASWP - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "Max aggregated wall time laswp . . . : %18.2f\n", + HPL_w[HPL_TIMING_LASWP - HPL_TIMING_BEG]); + /* + * Update + */ + if(HPL_w[HPL_TIMING_UPDATE - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "Max aggregated wall time update . . : %18.2f\n", + HPL_w[HPL_TIMING_UPDATE - HPL_TIMING_BEG]); + /* + * Upper triangular system solve + */ + if(HPL_w[HPL_TIMING_PTRSV - HPL_TIMING_BEG] > HPL_rzero) + HPL_fprintf(TEST->outfp, + "Max aggregated wall time up tr sv . : %18.2f\n", + HPL_w[HPL_TIMING_PTRSV - HPL_TIMING_BEG]); + + if(TEST->thrsh <= HPL_rzero) + HPL_fprintf(TEST->outfp, + "%s%s\n", + "========================================", + "========================================"); + } +#endif + + /* Release row swapping datatype */ + MPI_Type_free(&PDFACT_ROW); + + /* + * Quick return, if I am not interested in checking the computations + */ + if(TEST->thrsh <= HPL_rzero) { + (TEST->kpass)++; + HPL_pdmatfree(&mat); + return; + } + /* + * Check info returned by solve + */ + if(mat.info != 0) { + if((myrow == 0) && (mycol == 0)) + HPL_pwarn(TEST->outfp, + __LINE__, + "HPL_pdtest", + "%s %d, %s", + "Error code returned by solve is", + mat.info, + "skip"); + (TEST->kskip)++; + HPL_pdmatfree(&mat); + return; + } + /* + * Check computation, re-generate [ A | b ], compute norm 1 and inf of A and + * x, and norm inf of b - A x. Display residual checks. + */ + HPL_pdrandmat(GRID, N, N + 1, NB, mat.dA, mat.ld, HPL_ISEED); + + Anorm1 = HPL_pdlange(GRID, HPL_NORM_1, N, N, NB, mat.dA, mat.ld); + AnormI = HPL_pdlange(GRID, HPL_NORM_I, N, N, NB, mat.dA, mat.ld); + /* + * Because x is distributed in process rows, switch the norms + */ + XnormI = HPL_pdlange(GRID, HPL_NORM_1, 1, N, NB, mat.dX, 1); + Xnorm1 = HPL_pdlange(GRID, HPL_NORM_I, 1, N, NB, mat.dX, 1); + /* + * If I am in the col that owns b, (1) compute local BnormI, (2) all_reduce to + * find the max (in the col). Then (3) broadcast along the rows so that every + * process has BnormI. Note that since we use a uniform distribution in + * [-0.5,0.5] for the entries of B, it is very likely that BnormI (<=,~) 0.5. 
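+   * Since b is a single column, || b ||_oo is simply max_i |b_i|, which is
+   * why a device idamax followed by an absolute value is all that is
+   * needed below.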
+   */
+
+  // Bptr = Mptr( mat.A , 0, nq, mat.ld );
+  size_t BptrBytes = Mmax(mat.nq, mat.ld) * sizeof(double);
+  Bptr             = (double*)malloc(BptrBytes);
+
+  nq    = HPL_numroc(N, NB, NB, mycol, 0, npcol);
+  dBptr = Mptr(mat.dA, 0, nq, mat.ld);
+  if(mycol == HPL_indxg2p(N, NB, NB, 0, npcol)) {
+    if(mat.mp > 0) {
+      // int id = HPL_idamax( mat.mp, Bptr, 1);
+      // BnormI = Bptr[id];
+      int id;
+      rocblas_idamax(handle, mat.mp, dBptr, 1, &id);
+
+      // Note: id is in Fortran indexing
+      hipMemcpy(
+          &BnormI, dBptr + id - 1, 1 * sizeof(double), hipMemcpyDeviceToHost);
+      BnormI = Mabs(BnormI);
+    } else {
+      BnormI = HPL_rzero;
+    }
+    (void)HPL_all_reduce(
+        (void*)(&BnormI), 1, HPL_DOUBLE, HPL_MAX, GRID->col_comm);
+  }
+  (void)HPL_broadcast((void*)(&BnormI),
+                      1,
+                      HPL_DOUBLE,
+                      HPL_indxg2p(N, NB, NB, 0, npcol),
+                      GRID->row_comm);
+  /*
+   * If I own b, compute ( b - A x ) and ( - A x ) otherwise
+   */
+
+  // rocBLAS < v4.2 has an integer overflow problem in dgemv, so
+  // chunk the nq columns to compute the full dgemv
+  const int nq_chunk = std::numeric_limits<int>::max() / (mat.ld);
+
+  if(mycol == HPL_indxg2p(N, NB, NB, 0, npcol)) {
+    const double one  = 1.0;
+    const double mone = -1.0;
+
+    for(int nn = 0; nn < nq; nn += nq_chunk) {
+      int nb = Mmin(nq - nn, nq_chunk);
+      rocblas_dgemv(handle,
+                    rocblas_operation_none,
+                    mat.mp,
+                    nb,
+                    &mone,
+                    Mptr(mat.dA, 0, nn, mat.ld),
+                    mat.ld,
+                    Mptr(mat.dX, 0, nn, 1),
+                    1,
+                    &one,
+                    dBptr,
+                    1);
+    }
+
+    hipMemcpy(Bptr, dBptr, mat.mp * sizeof(double), hipMemcpyDeviceToHost);
+  } else if(nq > 0) {
+    const double one  = 1.0;
+    const double zero = 0.0;
+    const double mone = -1.0;
+
+    int nb = Mmin(nq, nq_chunk);
+    rocblas_dgemv(handle,
+                  rocblas_operation_none,
+                  mat.mp,
+                  nb,
+                  &mone,
+                  Mptr(mat.dA, 0, 0, mat.ld),
+                  mat.ld,
+                  Mptr(mat.dX, 0, 0, 1),
+                  1,
+                  &zero,
+                  dBptr,
+                  1);
+
+    for(int nn = nb; nn < nq; nn += nq_chunk) {
+      int nb = Mmin(nq - nn, nq_chunk);
+      rocblas_dgemv(handle,
+                    rocblas_operation_none,
+                    mat.mp,
+                    nb,
+                    &mone,
+                    Mptr(mat.dA, 0, nn, mat.ld),
+                    mat.ld,
+                    Mptr(mat.dX, 0, nn, 1),
+                    1,
+                    &one,
+                    dBptr,
+                    1);
+    }
+
+    hipMemcpy(Bptr, dBptr, mat.mp * sizeof(double), hipMemcpyDeviceToHost);
+  } else {
+    for(ii = 0; ii < mat.mp; ii++) Bptr[ii] = HPL_rzero;
+  }
+  /*
+   * Reduce the distributed residual in process column 0
+   */
+  if(mat.mp > 0)
+    (void)HPL_reduce(Bptr, mat.mp, HPL_DOUBLE, HPL_SUM, 0, GRID->row_comm);
+
+  /*
+   * Compute || b - A x ||_oo
+   */
+  hipMemcpy(dBptr, Bptr, mat.mp * sizeof(double), hipMemcpyHostToDevice);
+  resid0 = HPL_pdlange(GRID, HPL_NORM_I, N, 1, NB, dBptr, mat.ld);
+  /*
+   * Computes and displays norms, residuals ...
+   */
+  if(N <= 0) {
+    resid1 = HPL_rzero;
+  } else {
+    resid1 = resid0 / (TEST->epsil * (AnormI * XnormI + BnormI) * (double)(N));
+  }
+
+  if(resid1 < TEST->thrsh)
+    (TEST->kpass)++;
+  else
+    (TEST->kfail)++;
+
+  if((myrow == 0) && (mycol == 0)) {
+    HPL_fprintf(TEST->outfp,
+                "%s%s\n",
+                "----------------------------------------",
+                "----------------------------------------");
+    HPL_fprintf(TEST->outfp,
+                "%s%16.7f%s%s\n",
+                "||Ax-b||_oo/(eps*(||A||_oo*||x||_oo+||b||_oo)*N)= ",
+                resid1,
+                " ...... ",
+                (resid1 < TEST->thrsh ? "PASSED" : "FAILED"));
+
+    if(resid1 >= TEST->thrsh) {
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||Ax-b||_oo . . . . . . . . . . . . . . . . . = ",
+                  resid0);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||A||_oo . . . . . . . . . . . . . . . . . . . = ",
+                  AnormI);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||A||_1 . . . . . . . . . . . . . . . . . . . 
= ",
+                  Anorm1);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||x||_oo . . . . . . . . . . . . . . . . . . . = ",
+                  XnormI);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||x||_1 . . . . . . . . . . . . . . . . . . . = ",
+                  Xnorm1);
+      HPL_fprintf(TEST->outfp,
+                  "%s%18.6f\n",
+                  "||b||_oo . . . . . . . . . . . . . . . . . . . = ",
+                  BnormI);
+    }
+
+#ifdef HPL_PROGRESS_REPORT
+    if(resid1 < TEST->thrsh)
+      printf("Residual Check: PASSED \n");
+    else
+      printf("Residual Check: FAILED \n");
+#endif
+  }
+
+  if(Bptr) free(Bptr);
+  HPL_pdmatfree(&mat);
+}
diff --git a/src/auxil/HPL_abort.cpp b/src/auxil/HPL_abort.cpp
new file mode 100644
index 0000000..c83dc9e
--- /dev/null
+++ b/src/auxil/HPL_abort.cpp
@@ -0,0 +1,74 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_abort(int LINE, const char* SRNAME, const char* FORM, ...) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_abort displays an error message on stderr and halts execution.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * LINE    (local input)                 int
+   *         On entry, LINE specifies the line number in the file where
+   *         the error has occurred. When LINE is not a positive line
+   *         number, it is ignored.
+   *
+   * SRNAME  (local input)                 const char *
+   *         On entry, SRNAME should be the name of the routine calling
+   *         this error handler.
+   *
+   * FORM    (local input)                 const char *
+   *         On entry, FORM specifies the format, i.e., how the subsequent
+   *         arguments are converted for output.
+   *
+   *         (local input)                 ...
+   *         On entry, ... is the list of arguments to be printed within
+   *         the format string.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  va_list argptr;
+  char    cline[128];
+
+  va_start(argptr, FORM);
+  (void)vsprintf(cline, FORM, argptr);
+  va_end(argptr);
+  /*
+   * Display an error message
+   */
+  if(LINE <= 0)
+    HPL_fprintf(stderr,
+                "%s %s:\n>>> %s <<< Abort ...\n\n",
+                "HPL ERROR in function",
+                SRNAME,
+                cline);
+  else
+    HPL_fprintf(stderr,
+                "%s %d %s %s:\n>>> %s <<< Abort ...\n\n",
+                "HPL ERROR on line",
+                LINE,
+                "of function",
+                SRNAME,
+                cline);
+  exit(0);
+}
diff --git a/src/auxil/HPL_dlacpy.cpp b/src/auxil/HPL_dlacpy.cpp
new file mode 100644
index 0000000..7216618
--- /dev/null
+++ b/src/auxil/HPL_dlacpy.cpp
@@ -0,0 +1,68 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlacpy(const int M, + const int N, + const double* A, + const int LDA, + double* B, + const int LDB) { + /* + * Purpose + * ======= + * + * HPL_dlacpy copies an array A into an array B. + * + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of the arrays A and + * B. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of the arrays A + * and B. N must be at least zero. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,N). + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M). + * + * B (local output) double * + * On entry, B points to an array of dimension (LDB,N). On exit, + * B is overwritten with A. + * + * LDB (local input) const int + * On entry, LDB specifies the leading dimension of the array B. + * LDB must be at least MAX(1,M). + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + int j; + + if((M <= 0) || (N <= 0)) return; + + for(j = 0; j < N; j++, A += LDA, B += LDB) HPL_dcopy(M, A, 1, B, 1); +} diff --git a/src/auxil/HPL_dlamch.cpp b/src/auxil/HPL_dlamch.cpp new file mode 100644 index 0000000..9adc0f6 --- /dev/null +++ b/src/auxil/HPL_dlamch.cpp @@ -0,0 +1,763 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +/* + * --------------------------------------------------------------------- + * Static function prototypes + * --------------------------------------------------------------------- + */ +static void HPL_dlamc1(int*, int*, int*, int*); +static void HPL_dlamc2(int*, int*, int*, double*, int*, double*, int*, double*); +static double HPL_dlamc3(const double, const double); +static void HPL_dlamc4(int*, const double, const int); +static void HPL_dlamc5(const int, + const int, + const int, + const int, + int*, + double*); +static double HPL_dipow(const double, const int); + +double HPL_dlamch(const HPL_T_MACH CMACH) { + /* + * Purpose + * ======= + * + * HPL_dlamch determines machine-specific arithmetic constants such as + * the relative machine precision (eps), the safe minimum (sfmin) such + * that 1 / sfmin does not overflow, the base of the machine (base), the + * precision (prec), the number of (base) digits in the mantissa (t), + * whether rounding occurs in addition (rnd=1.0 and 0.0 otherwise), the + * minimum exponent before (gradual) underflow (emin), the underflow + * threshold (rmin) base**(emin-1), the largest exponent before overflow + * (emax), the overflow threshold (rmax) (base**emax)*(1-eps). 
+ * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamch.f (version 2.0 -- 1992), that was itself + * based on the function ENVRON by Malcolm and incorporated suggestions + * by Gentleman and Marovich. See + * + * Malcolm M. A., Algorithms to reveal properties of floating-point + * arithmetic., Comms. of the ACM, 15, 949-951 (1972). + * + * Gentleman W. M. and Marovich S. B., More on algorithms that reveal + * properties of floating point arithmetic units., Comms. of the ACM, + * 17, 276-277 (1974). + * + * Arguments + * ========= + * + * CMACH (local input) const HPL_T_MACH + * Specifies the value to be returned by HPL_dlamch + * = HPL_MACH_EPS, HPL_dlamch := eps (default) + * = HPL_MACH_SFMIN, HPL_dlamch := sfmin + * = HPL_MACH_BASE, HPL_dlamch := base + * = HPL_MACH_PREC, HPL_dlamch := eps*base + * = HPL_MACH_MLEN, HPL_dlamch := t + * = HPL_MACH_RND, HPL_dlamch := rnd + * = HPL_MACH_EMIN, HPL_dlamch := emin + * = HPL_MACH_RMIN, HPL_dlamch := rmin + * = HPL_MACH_EMAX, HPL_dlamch := emax + * = HPL_MACH_RMAX, HPL_dlamch := rmax + * + * where + * + * eps = relative machine precision, + * sfmin = safe minimum, + * base = base of the machine, + * prec = eps*base, + * t = number of digits in the mantissa, + * rnd = 1.0 if rounding occurs in addition, + * emin = minimum exponent before underflow, + * rmin = underflow threshold, + * emax = largest exponent before overflow, + * rmax = overflow threshold. + * + * --------------------------------------------------------------------- + */ + + static double eps, sfmin, base, t, rnd, emin, rmin, emax, rmax, prec; + double small; + static int first = 1; + int beta = 0, imax = 0, imin = 0, it = 0, lrnd = 0; + + if(first != 0) { + first = 0; + HPL_dlamc2(&beta, &it, &lrnd, &eps, &imin, &rmin, &imax, &rmax); + base = (double)(beta); + t = (double)(it); + if(lrnd != 0) { + rnd = HPL_rone; + eps = HPL_dipow(base, 1 - it) / HPL_rtwo; + } else { + rnd = HPL_rzero; + eps = HPL_dipow(base, 1 - it); + } + prec = eps * base; + emin = (double)(imin); + emax = (double)(imax); + sfmin = rmin; + small = HPL_rone / rmax; + /* + * Use SMALL plus a bit, to avoid the possibility of rounding causing + * overflow when computing 1/sfmin. + */ + if(small >= sfmin) sfmin = small * (HPL_rone + eps); + } + + if(CMACH == HPL_MACH_EPS) return (eps); + if(CMACH == HPL_MACH_SFMIN) return (sfmin); + if(CMACH == HPL_MACH_BASE) return (base); + if(CMACH == HPL_MACH_PREC) return (prec); + if(CMACH == HPL_MACH_MLEN) return (t); + if(CMACH == HPL_MACH_RND) return (rnd); + if(CMACH == HPL_MACH_EMIN) return (emin); + if(CMACH == HPL_MACH_RMIN) return (rmin); + if(CMACH == HPL_MACH_EMAX) return (emax); + if(CMACH == HPL_MACH_RMAX) return (rmax); + + return (eps); +} + +static void HPL_dlamc1(int* BETA, int* T, int* RND, int* IEEE1) { + /* + * Purpose + * ======= + * + * HPL_dlamc1 determines the machine parameters given by BETA, T, RND, + * and IEEE1. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc1.f (version 2.0 -- 1992), that was itself + * based on the function ENVRON by Malcolm and incorporated suggestions + * by Gentleman and Marovich. See + * + * Malcolm M. A., Algorithms to reveal properties of floating-point + * arithmetic., Comms. of the ACM, 15, 949-951 (1972). + * + * Gentleman W. M. and Marovich S. B., More on algorithms that reveal + * properties of floating point arithmetic units., Comms. 
of the ACM, + * 17, 276-277 (1974). + * + * Arguments + * ========= + * + * BETA (local output) int * + * The base of the machine. + * + * T (local output) int * + * The number of ( BETA ) digits in the mantissa. + * + * RND (local output) int * + * Specifies whether proper rounding (RND=1) or chopping (RND=0) + * occurs in addition. This may not be a reliable guide to the + * way in which the machine performs its arithmetic. + * + * IEEE1 (local output) int * + * Specifies whether rounding appears to be done in the IEEE + * `round to nearest' style (IEEE1=1), (IEEE1=0) otherwise. + * + * --------------------------------------------------------------------- + */ + + double a, b, c, f, one, qtr, savec, t1, t2; + static int first = 1, lbeta, lieee1, lrnd, lt; + + if(first != 0) { + first = 0; + one = HPL_rone; + /* + * lbeta, lieee1, lt and lrnd are the local values of BETA, IEEE1, T and + * RND. Throughout this routine we use the function HPL_dlamc3 to ensure + * that relevant values are stored and not held in registers, or are not + * affected by optimizers. + * + * Compute a = 2.0**m with the smallest positive integer m such that + * fl( a + 1.0 ) == a. + */ + a = HPL_rone; + c = HPL_rone; + do { + a *= HPL_rtwo; + c = HPL_dlamc3(a, one); + c = HPL_dlamc3(c, -a); + } while(c == HPL_rone); + /* + * Now compute b = 2.0**m with the smallest positive integer m such that + * fl( a + b ) > a. + */ + b = HPL_rone; + c = HPL_dlamc3(a, b); + while(c == a) { + b *= HPL_rtwo; + c = HPL_dlamc3(a, b); + } + /* + * Now compute the base. a and c are neighbouring floating point num- + * bers in the interval ( BETA**T, BETA**( T + 1 ) ) and so their diffe- + * rence is BETA. Adding 0.25 to c is to ensure that it is truncated to + * BETA and not (BETA-1). + */ + qtr = one / 4.0; + savec = c; + c = HPL_dlamc3(c, -a); + lbeta = (int)(c + qtr); + /* + * Now determine whether rounding or chopping occurs, by adding a bit + * less than BETA/2 and a bit more than BETA/2 to a. + */ + b = (double)(lbeta); + f = HPL_dlamc3(b / HPL_rtwo, -b / 100.0); + c = HPL_dlamc3(f, a); + if(c == a) { + lrnd = 1; + } else { + lrnd = 0; + } + f = HPL_dlamc3(b / HPL_rtwo, b / 100.0); + c = HPL_dlamc3(f, a); + if((lrnd != 0) && (c == a)) lrnd = 0; + /* + * Try and decide whether rounding is done in the IEEE round to nea- + * rest style. b/2 is half a unit in the last place of the two numbers + * a and savec. Furthermore, a is even, i.e. has last bit zero, and sa- + * vec is odd. Thus adding b/2 to a should not change a, but adding b/2 + * to savec should change savec. + */ + t1 = HPL_dlamc3(b / HPL_rtwo, a); + t2 = HPL_dlamc3(b / HPL_rtwo, savec); + if((t1 == a) && (t2 > savec) && (lrnd != 0)) + lieee1 = 1; + else + lieee1 = 0; + /* + * Now find the mantissa, T. It should be the integer part of log to the + * base BETA of a, however it is safer to determine T by powering. So we + * find T as the smallest positive integer for which fl( beta**t + 1.0 ) + * is equal to 1.0. + */ + lt = 0; + a = HPL_rone; + c = HPL_rone; + + do { + lt++; + a *= (double)(lbeta); + c = HPL_dlamc3(a, one); + c = HPL_dlamc3(c, -a); + } while(c == HPL_rone); + } + + *BETA = lbeta; + *T = lt; + *RND = lrnd; + *IEEE1 = lieee1; +} + +static void HPL_dlamc2(int* BETA, + int* T, + int* RND, + double* EPS, + int* EMIN, + double* RMIN, + int* EMAX, + double* RMAX) { + /* + * Purpose + * ======= + * + * HPL_dlamc2 determines the machine parameters specified in its argu- + * ment list. 
+ *
+ * Notes
+ * =====
+ *
+ * This function has been manually translated from the Fortran 77 LAPACK
+ * auxiliary function dlamc2.f (version 2.0 -- 1992), that was itself
+ * based on a function PARANOIA by W. Kahan of the University of Cali-
+ * fornia at Berkeley for the computation of the relative machine epsi-
+ * lon eps.
+ *
+ * Arguments
+ * =========
+ *
+ * BETA    (local output)                int *
+ *         The base of the machine.
+ *
+ * T       (local output)                int *
+ *         The number of ( BETA ) digits in the mantissa.
+ *
+ * RND     (local output)                int *
+ *         Specifies whether proper rounding (RND=1) or chopping (RND=0)
+ *         occurs in addition. This may not be a reliable guide to the
+ *         way in which the machine performs its arithmetic.
+ *
+ * EPS     (local output)                double *
+ *         The smallest positive number such that fl( 1.0 - EPS ) < 1.0,
+ *         where fl denotes the computed value.
+ *
+ * EMIN    (local output)                int *
+ *         The minimum exponent before (gradual) underflow occurs.
+ *
+ * RMIN    (local output)                double *
+ *         The smallest normalized number for the machine, given by
+ *         BASE**( EMIN - 1 ), where BASE is the floating point value
+ *         of BETA.
+ *
+ * EMAX    (local output)                int *
+ *         The maximum exponent before overflow occurs.
+ *
+ * RMAX    (local output)                double *
+ *         The largest positive number for the machine, given by
+ *         BASE**EMAX * ( 1 - EPS ), where BASE is the floating point
+ *         value of BETA.
+ *
+ * ---------------------------------------------------------------------
+ */
+
+  static double leps, lrmax, lrmin;
+  double a, b, c, half, one, rbase, sixth, small, third, two, zero;
+  static int first = 1, iwarn = 0, lbeta = 0, lemax, lemin, lt = 0;
+  int gnmin = 0, gpmin = 0, i, ieee, lieee1 = 0, lrnd = 0, ngnmin = 0,
+      ngpmin = 0;
+
+  if(first != 0) {
+    first = 0;
+    zero  = HPL_rzero;
+    one   = HPL_rone;
+    two   = HPL_rtwo;
+    /*
+     * lbeta, lt, lrnd, leps, lemin and lrmin are the local values of BETA,
+     * T, RND, EPS, EMIN and RMIN.
+     *
+     * Throughout this routine we use the function HPL_dlamc3 to ensure that
+     * relevant values are stored and not held in registers, or are not af-
+     * fected by optimizers.
+     *
+     * HPL_dlamc1 returns the parameters lbeta, lt, lrnd and lieee1.
+     */
+    HPL_dlamc1(&lbeta, &lt, &lrnd, &lieee1);
+    /*
+     * Start to find eps.
+     */
+    b    = (double)(lbeta);
+    a    = HPL_dipow(b, -lt);
+    leps = a;
+    /*
+     * Try some tricks to see whether or not this is the correct EPS.
+     */
+    b     = two / 3.0;
+    half  = one / HPL_rtwo;
+    sixth = HPL_dlamc3(b, -half);
+    third = HPL_dlamc3(sixth, sixth);
+    b     = HPL_dlamc3(third, -half);
+    b     = HPL_dlamc3(b, sixth);
+    b     = Mabs(b);
+    if(b < leps) b = leps;
+
+    leps = HPL_rone;
+
+    while((leps > b) && (b > zero)) {
+      leps = b;
+      c    = HPL_dlamc3(half * leps, HPL_dipow(two, 5) * HPL_dipow(leps, 2));
+      c    = HPL_dlamc3(half, -c);
+      b    = HPL_dlamc3(half, c);
+      c    = HPL_dlamc3(half, -b);
+      b    = HPL_dlamc3(half, c);
+    }
+    if(a < leps) leps = a;
+    /*
+     * Computation of EPS complete.
+     *
+     * Now find EMIN. Let a = + or - 1, and + or - (1 + BASE**(-3)). Keep
+     * dividing a by BETA until (gradual) underflow occurs. This is detected
+     * when we cannot recover the previous a.
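+     * For reference (values not computed here): on IEEE-754 double
+     * precision hardware this procedure typically finds beta = 2, t = 53,
+     * rnd = 1.0, eps = 2^(-53), emin = -1021, rmin = 2^(-1022),
+     * emax = 1024 and rmax ~ 1.7977e+308.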
+ */ + rbase = one / (double)(lbeta); + small = one; + for(i = 0; i < 3; i++) small = HPL_dlamc3(small * rbase, zero); + a = HPL_dlamc3(one, small); + HPL_dlamc4(&ngpmin, one, lbeta); + HPL_dlamc4(&ngnmin, -one, lbeta); + HPL_dlamc4(&gpmin, a, lbeta); + HPL_dlamc4(&gnmin, -a, lbeta); + + ieee = 0; + + if((ngpmin == ngnmin) && (gpmin == gnmin)) { + if(ngpmin == gpmin) { + /* + * Non twos-complement machines, no gradual underflow; e.g., VAX ) + */ + lemin = ngpmin; + } else if((gpmin - ngpmin) == 3) { + /* + * Non twos-complement machines with gradual underflow; e.g., IEEE stan- + * dard followers + */ + lemin = ngpmin - 1 + lt; + ieee = 1; + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, gpmin); + iwarn = 1; + } + } else if((ngpmin == gpmin) && (ngnmin == gnmin)) { + if(Mabs(ngpmin - ngnmin) == 1) { + /* + * Twos-complement machines, no gradual underflow; e.g., CYBER 205 + */ + lemin = Mmax(ngpmin, ngnmin); + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, ngnmin); + iwarn = 1; + } + } else if((Mabs(ngpmin - ngnmin) == 1) && (gpmin == gnmin)) { + if((gpmin - Mmin(ngpmin, ngnmin)) == 3) { + /* + * Twos-complement machines with gradual underflow; no known machine + */ + lemin = Mmax(ngpmin, ngnmin) - 1 + lt; + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, ngnmin); + iwarn = 1; + } + } else { + /* + * A guess; no known machine + */ + lemin = Mmin(ngpmin, ngnmin); + lemin = Mmin(lemin, gpmin); + lemin = Mmin(lemin, gnmin); + iwarn = 1; + } + /* + * Comment out this if block if EMIN is ok + */ + if(iwarn != 0) { + first = 1; + HPL_fprintf(stderr, + "\n %s %8d\n%s\n%s\n%s\n", + "WARNING. The value EMIN may be incorrect:- EMIN =", + lemin, + "If, after inspection, the value EMIN looks acceptable, " + "please comment ", + "out the if block as marked within the code of routine " + "HPL_dlamc2, ", + "otherwise supply EMIN explicitly."); + } + /* + * Assume IEEE arithmetic if we found denormalised numbers above, or if + * arithmetic seems to round in the IEEE style, determined in routine + * HPL_dlamc1. A true IEEE machine should have both things true; how- + * ever, faulty machines may have one or the other. + */ + if((ieee != 0) || (lieee1 != 0)) + ieee = 1; + else + ieee = 0; + /* + * Compute RMIN by successive division by BETA. We could compute RMIN + * as BASE**( EMIN - 1 ), but some machines underflow during this compu- + * tation. + */ + lrmin = HPL_rone; + for(i = 0; i < 1 - lemin; i++) lrmin = HPL_dlamc3(lrmin * rbase, zero); + /* + * Finally, call HPL_dlamc5 to compute emax and rmax. + */ + HPL_dlamc5(lbeta, lt, lemin, ieee, &lemax, &lrmax); + } + *BETA = lbeta; + *T = lt; + *RND = lrnd; + *EPS = leps; + *EMIN = lemin; + *RMIN = lrmin; + *EMAX = lemax; + *RMAX = lrmax; +} + +static double HPL_dlamc3(const double A, const double B) { + /* + * Purpose + * ======= + * + * HPL_dlamc3 is intended to force a and b to be stored prior to doing + * the addition of a and b, for use in situations where optimizers + * might hold one of these in a register. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc3.f (version 2.0 -- 1992). + * + * Arguments + * ========= + * + * A, B (local input) double + * The values a and b. 
+ * + * --------------------------------------------------------------------- + */ + + return (A + B); +} + +static void HPL_dlamc4(int* EMIN, const double START, const int BASE) { + /* + * Purpose + * ======= + * + * HPL_dlamc4 is a service function for HPL_dlamc2. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc4.f (version 2.0 -- 1992). + * + * Arguments + * ========= + * + * EMIN (local output) int * + * The minimum exponent before (gradual) underflow, computed by + * setting A = START and dividing by BASE until the previous A + * can not be recovered. + * + * START (local input) double + * The starting point for determining EMIN. + * + * BASE (local input) int + * The base of the machine. + * + * --------------------------------------------------------------------- + */ + + double a, b1, b2, c1, c2, d1, d2, one, rbase, zero; + int i; + + a = START; + one = HPL_rone; + rbase = one / (double)(BASE); + zero = HPL_rzero; + *EMIN = 1; + b1 = HPL_dlamc3(a * rbase, zero); + c1 = c2 = d1 = d2 = a; + + do { + (*EMIN)--; + a = b1; + b1 = HPL_dlamc3(a / BASE, zero); + c1 = HPL_dlamc3(b1 * BASE, zero); + d1 = zero; + for(i = 0; i < BASE; i++) d1 = d1 + b1; + b2 = HPL_dlamc3(a * rbase, zero); + c2 = HPL_dlamc3(b2 / rbase, zero); + d2 = zero; + for(i = 0; i < BASE; i++) d2 = d2 + b2; + } while((c1 == a) && (c2 == a) && (d1 == a) && (d2 == a)); +} + +static void HPL_dlamc5(const int BETA, + const int P, + const int EMIN, + const int IEEE, + int* EMAX, + double* RMAX) { + /* + * Purpose + * ======= + * + * HPL_dlamc5 attempts to compute RMAX, the largest machine floating- + * point number, without overflow. It assumes that EMAX + abs(EMIN) sum + * approximately to a power of 2. It will fail on machines where this + * assumption does not hold, for example, the Cyber 205 (EMIN = -28625, + * EMAX = 28718). It will also fail if the value supplied for EMIN is + * too large (i.e. too close to zero), probably with overflow. + * + * Notes + * ===== + * + * This function has been manually translated from the Fortran 77 LAPACK + * auxiliary function dlamc5.f (version 2.0 -- 1992). + * + * Arguments + * ========= + * + * BETA (local input) int + * The base of floating-point arithmetic. + * + * P (local input) int + * The number of base BETA digits in the mantissa of a floating- + * point value. + * + * EMIN (local input) int + * The minimum exponent before (gradual) underflow. + * + * IEEE (local input) int + * A logical flag specifying whether or not the arithmetic sys- + * tem is thought to comply with the IEEE standard. + * + * EMAX (local output) int * + * The largest exponent before overflow. + * + * RMAX (local output) double * + * The largest machine floating-point number. + * + * --------------------------------------------------------------------- + */ + + double oldy = HPL_rzero, recbas, y, z; + int exbits = 1, expsum, i, lexp = 1, nbits, ttry, uexp; +/* .. + * .. Executable Statements .. + */ +/* + * First compute lexp and uexp, two powers of 2 that bound abs(EMIN). + * We then assume that EMAX + abs( EMIN ) will sum approximately to the + * bound that is closest to abs( EMIN ). (EMAX is the exponent of the + * required number RMAX). 
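+ * Worked example (not computed here): for IEEE double precision inputs
+ * BETA = 2, P = 53, EMIN = -1021 and IEEE = 1, the loop below stops with
+ * lexp = 512 and then sets uexp = 1024 with exbits = 11, so expsum = 2048
+ * and EMAX is first set to 2048 - 1021 - 1 = 1026; the two decrements
+ * further down (implicit mantissa bit, reserved IEEE exponent) then
+ * leave EMAX = 1024.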
+ */ +l_10: + ttry = (int)((unsigned int)(lexp) << 1); + if(ttry <= (-EMIN)) { + lexp = ttry; + exbits++; + goto l_10; + } + + if(lexp == -EMIN) { + uexp = lexp; + } else { + uexp = ttry; + exbits++; + } + /* + * Now -lexp is less than or equal to EMIN, and -uexp is greater than or + * equal to EMIN. exbits is the number of bits needed to store the expo- + * nent. + */ + if((uexp + EMIN) > (-lexp - EMIN)) { + expsum = (int)((unsigned int)(lexp) << 1); + } else { + expsum = (int)((unsigned int)(uexp) << 1); + } + /* + * expsum is the exponent range, approximately equal to EMAX - EMIN + 1. + */ + *EMAX = expsum + EMIN - 1; + /* + * nbits is the total number of bits needed to store a floating-point + * number. + */ + nbits = 1 + exbits + P; + + if((nbits % 2 == 1) && (BETA == 2)) { + /* + * Either there are an odd number of bits used to store a floating-point + * number, which is unlikely, or some bits are not used in the represen- + * tation of numbers, which is possible, (e.g. Cray machines) or the + * mantissa has an implicit bit, (e.g. IEEE machines, Dec Vax machines), + * which is perhaps the most likely. We have to assume the last alterna- + * tive. If this is true, then we need to reduce EMAX by one because + * there must be some way of representing zero in an implicit-bit sys- + * tem. On machines like Cray we are reducing EMAX by one unnecessarily. + */ + (*EMAX)--; + } + + if(IEEE != 0) { + /* + * Assume we are on an IEEE machine which reserves one exponent for in- + * finity and NaN. + */ + (*EMAX)--; + } + /* + * Now create RMAX, the largest machine number, which should be equal to + * (1.0 - BETA**(-P)) * BETA**EMAX . First compute 1.0-BETA**(-P), being + * careful that the result is less than 1.0. + */ + recbas = HPL_rone / (double)(BETA); + z = (double)(BETA)-HPL_rone; + y = HPL_rzero; + + for(i = 0; i < P; i++) { + z *= recbas; + if(y < HPL_rone) oldy = y; + y = HPL_dlamc3(y, z); + } + + if(y >= HPL_rone) y = oldy; + /* + * Now multiply by BETA**EMAX to get RMAX. + */ + for(i = 0; i < *EMAX; i++) y = HPL_dlamc3(y * BETA, HPL_rzero); + + *RMAX = y; +} + +static double HPL_dipow(const double X, const int N) { + /* + * Purpose + * ======= + * + * HPL_dipow computes the integer n-th power of a real scalar x. + * + * Arguments + * ========= + * + * X (local input) const double + * The real scalar x. + * + * N (local input) const int + * The integer power to raise x to. + * + * --------------------------------------------------------------------- + */ + + double r, y = HPL_rone; + int k, n; + + if(X == HPL_rzero) return (HPL_rzero); + if(N < 0) { + n = -N; + r = HPL_rone / X; + } else { + n = N; + r = X; + } + for(k = 0; k < n; k++) y *= r; + + return (y); +} diff --git a/src/auxil/HPL_dlange.cpp b/src/auxil/HPL_dlange.cpp new file mode 100644 index 0000000..9f72ed4 --- /dev/null +++ b/src/auxil/HPL_dlange.cpp @@ -0,0 +1,132 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +double HPL_dlange(const HPL_T_NORM NORM, + const int M, + const int N, + const double* A, + const int LDA) { + /* + * Purpose + * ======= + * + * HPL_dlange returns the value of the one norm, or the infinity norm, + * or the element of largest absolute value of a matrix A: + * + * max(abs(A(i,j))) when NORM = HPL_NORM_A, + * norm1(A), when NORM = HPL_NORM_1, + * normI(A), when NORM = HPL_NORM_I, + * + * where norm1 denotes the one norm of a matrix (maximum column sum) and + * normI denotes the infinity norm of a matrix (maximum row sum). Note + * that max(abs(A(i,j))) is not a matrix norm. + * + * Arguments + * ========= + * + * NORM (local input) const HPL_T_NORM + * On entry, NORM specifies the value to be returned by this + * function as described above. + * + * M (local input) const int + * On entry, M specifies the number of rows of the matrix A. + * M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of the matrix A. + * N must be at least zero. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,N), that + * contains the matrix A. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,M). + * + * --------------------------------------------------------------------- + */ + + double s, v0 = HPL_rzero, *work = NULL; + int i, j; + + if((M <= 0) || (N <= 0)) return (HPL_rzero); + + if(NORM == HPL_NORM_A) { + /* + * max( abs( A ) ) + */ + for(j = 0; j < N; j++) { + for(i = 0; i < M; i++) { + v0 = Mmax(v0, Mabs(*A)); + A++; + } + A += LDA - M; + } + } else if(NORM == HPL_NORM_1) { + /* + * Find norm_1( A ). + */ + work = (double*)malloc((size_t)(N) * sizeof(double)); + if(work == NULL) { + HPL_abort(__LINE__, "HPL_dlange", "Memory allocation failed"); + } else { + for(j = 0; j < N; j++) { + s = HPL_rzero; + for(i = 0; i < M; i++) { + s += Mabs(*A); + A++; + } + work[j] = s; + A += LDA - M; + } + /* + * Find maximum sum of columns for 1-norm + */ + v0 = work[HPL_idamax(N, work, 1)]; + v0 = Mabs(v0); + if(work) free(work); + } + } else if(NORM == HPL_NORM_I) { + /* + * Find norm_inf( A ) + */ + work = (double*)malloc((size_t)(M) * sizeof(double)); + if(work == NULL) { + HPL_abort(__LINE__, "HPL_dlange", "Memory allocation failed"); + } else { + for(i = 0; i < M; i++) { work[i] = HPL_rzero; } + + for(j = 0; j < N; j++) { + for(i = 0; i < M; i++) { + work[i] += Mabs(*A); + A++; + } + A += LDA - M; + } + /* + * Find maximum sum of rows for inf-norm + */ + v0 = work[HPL_idamax(M, work, 1)]; + v0 = Mabs(v0); + if(work) free(work); + } + } + + return (v0); +} diff --git a/src/auxil/HPL_dlaprnt.cpp b/src/auxil/HPL_dlaprnt.cpp new file mode 100644 index 0000000..ae8d7f5 --- /dev/null +++ b/src/auxil/HPL_dlaprnt.cpp @@ -0,0 +1,76 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlaprnt(const int M, + const int N, + double* A, + const int IA, + const int JA, + const int LDA, + const char* CMATNM) { + /* + * Purpose + * ======= + * + * HPL_dlaprnt prints to standard error an M-by-N matrix A. + * + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A. M must be at + * least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of A. N must be + * at least zero. + * + * A (local input) double * + * On entry, A points to an array of dimension (LDA,N). + * + * IA (local input) const int + * On entry, IA specifies the starting row index to be printed. + * + * JA (local input) const int + * On entry, JA specifies the starting column index to be + * printed. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,M). + * + * CMATNM (local input) const char * + * On entry, CMATNM is the name of the matrix to be printed. + * + * --------------------------------------------------------------------- + */ + + int i, j; + + for(j = 0; j < N; j++) { + for(i = 0; i < M; i++) { + HPL_fprintf(stderr, + "%s(%6d,%6d)=%30.18f\n", + CMATNM, + IA + i, + JA + j, + *(Mptr(A, i, j, LDA))); + } + } +} diff --git a/src/auxil/HPL_dlatcpy.cpp b/src/auxil/HPL_dlatcpy.cpp new file mode 100644 index 0000000..da62278 --- /dev/null +++ b/src/auxil/HPL_dlatcpy.cpp @@ -0,0 +1,68 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlatcpy(const int M, + const int N, + const double* A, + const int LDA, + double* B, + const int LDB) { + /* + * Purpose + * ======= + * + * HPL_dlatcpy copies the transpose of an array A into an array B. + * + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of the array B and + * the number of columns of A. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of rows of the array A and + * the number of columns of B. N must be at least zero. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,M). + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,N). + * + * B (local output) double * + * On entry, B points to an array of dimension (LDB,N). On exit, + * B is overwritten with the transpose of A. + * + * LDB (local input) const int + * On entry, LDB specifies the leading dimension of the array B. + * LDB must be at least MAX(1,M). + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. 
+   */
+  int j;
+
+  if((M <= 0) || (N <= 0)) return;
+
+  for(j = 0; j < N; j++, B += LDB) HPL_dcopy(M, A + j, LDA, B, 1);
+}
diff --git a/src/auxil/HPL_dlatcpy_device.cpp b/src/auxil/HPL_dlatcpy_device.cpp
new file mode 100644
index 0000000..c41caab
--- /dev/null
+++ b/src/auxil/HPL_dlatcpy_device.cpp
@@ -0,0 +1,113 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define TILE_DIM 64
+#define BLOCK_ROWS 16
+
+__global__ void dlatcpy_gpu(const int M,
+                            const int N,
+                            const double* __restrict__ A,
+                            const int LDA,
+                            double* __restrict__ B,
+                            const int LDB) {
+
+  __shared__ double s_tile[TILE_DIM][TILE_DIM + 1];
+
+  int I = blockIdx.x * TILE_DIM + threadIdx.y;
+  int J = blockIdx.y * TILE_DIM + threadIdx.x;
+
+  if(J < N) {
+    if(I + 0 < M)
+      s_tile[threadIdx.y + 0][threadIdx.x] = A[((size_t)I + 0) * LDA + J];
+    if(I + 16 < M)
+      s_tile[threadIdx.y + 16][threadIdx.x] = A[((size_t)I + 16) * LDA + J];
+    if(I + 32 < M)
+      s_tile[threadIdx.y + 32][threadIdx.x] = A[((size_t)I + 32) * LDA + J];
+    if(I + 48 < M)
+      s_tile[threadIdx.y + 48][threadIdx.x] = A[((size_t)I + 48) * LDA + J];
+  }
+
+  I = blockIdx.x * TILE_DIM + threadIdx.x;
+  J = blockIdx.y * TILE_DIM + threadIdx.y;
+
+  __syncthreads();
+
+  if(I < M) {
+    if(J + 0 < N)
+      B[I + ((size_t)J + 0) * LDB] = s_tile[threadIdx.x][threadIdx.y + 0];
+    if(J + 16 < N)
+      B[I + ((size_t)J + 16) * LDB] = s_tile[threadIdx.x][threadIdx.y + 16];
+    if(J + 32 < N)
+      B[I + ((size_t)J + 32) * LDB] = s_tile[threadIdx.x][threadIdx.y + 32];
+    if(J + 48 < N)
+      B[I + ((size_t)J + 48) * LDB] = s_tile[threadIdx.x][threadIdx.y + 48];
+  }
+}
+
+void HPL_dlatcpy_gpu(const int M,
+                     const int N,
+                     const double* A,
+                     const int LDA,
+                     double* B,
+                     const int LDB) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_dlatcpy_gpu copies the transpose of an array A into an array B.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * M       (local input)                 const int
+   *         On entry, M specifies the number of rows of the array B and
+   *         the number of columns of A. M must be at least zero.
+   *
+   * N       (local input)                 const int
+   *         On entry, N specifies the number of rows of the array A and
+   *         the number of columns of B. N must be at least zero.
+   *
+   * A       (local input)                 const double *
+   *         On entry, A points to an array of dimension (LDA,M).
+   *
+   * LDA     (local input)                 const int
+   *         On entry, LDA specifies the leading dimension of the array A.
+   *         LDA must be at least MAX(1,N).
+   *
+   * B       (local output)                double *
+   *         On entry, B points to an array of dimension (LDB,N). On exit,
+   *         B is overwritten with the transpose of A.
+   *
+   * LDB     (local input)                 const int
+   *         On entry, LDB specifies the leading dimension of the array B.
+   *         LDB must be at least MAX(1,M).
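+   *
+   * Implementation note: each thread block stages one TILE_DIM x TILE_DIM
+   * (64 x 64) tile of A through shared memory using a 64 x 16 thread
+   * block, four rows per thread; s_tile is padded to TILE_DIM + 1 columns
+   * so the transposed reads do not generate shared memory bank conflicts.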
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  if((M <= 0) || (N <= 0)) return;
+
+  hipStream_t stream;
+  rocblas_get_stream(handle, &stream);
+
+  dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM);
+  dim3 block_size(TILE_DIM, BLOCK_ROWS);
+  dlatcpy_gpu<<<grid_size, block_size, 0, stream>>>(M, N, A, LDA, B, LDB);
+}
diff --git a/src/auxil/HPL_fprintf.cpp b/src/auxil/HPL_fprintf.cpp
new file mode 100644
index 0000000..d0ee2f0
--- /dev/null
+++ b/src/auxil/HPL_fprintf.cpp
@@ -0,0 +1,53 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_fprintf(FILE* STREAM, const char* FORM, ...) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_fprintf is a wrapper around fprintf flushing the output stream.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * STREAM  (local input)                 FILE *
+   *         On entry, STREAM specifies the output stream.
+   *
+   * FORM    (local input)                 const char *
+   *         On entry, FORM specifies the format, i.e., how the subsequent
+   *         arguments are converted for output.
+   *
+   *         (local input)                 ...
+   *         On entry, ... is the list of arguments to be printed within
+   *         the format string.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  va_list argptr;
+  char    cline[256];
+
+  va_start(argptr, FORM);
+  (void)vsprintf(cline, FORM, argptr);
+  va_end(argptr);
+
+  (void)fprintf(STREAM, "%s", cline);
+  (void)fflush(STREAM);
+}
diff --git a/src/auxil/HPL_warn.cpp b/src/auxil/HPL_warn.cpp
new file mode 100644
index 0000000..d6e66ab
--- /dev/null
+++ b/src/auxil/HPL_warn.cpp
@@ -0,0 +1,80 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_warn(FILE* STREAM,
+              int LINE,
+              const char* SRNAME,
+              const char* FORM,
+              ...) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_warn displays an error message.
+   *
+   *
+   * Arguments
+   * =========
+   *
+   * STREAM  (local input)                 FILE *
+   *         On entry, STREAM specifies the output stream.
+   *
+   * LINE    (local input)                 int
+   *         On entry, LINE specifies the line number in the file where
+   *         the error has occurred. When LINE is not a positive line
+   *         number, it is ignored.
+   *
+   * SRNAME  (local input)                 const char *
+   *         On entry, SRNAME should be the name of the routine calling
+   *         this error handler.
+   *
+   * FORM    (local input)                 const char *
+   *         On entry, FORM specifies the format, i.e., how the subsequent
+   *         arguments are converted for output.
+   *
+   *         (local input)                 ...
+   *         On entry, ... is the list of arguments to be printed within
+   *         the format string.
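+   *
+   * A typical (hypothetical) call would be
+   *   HPL_warn(stderr, __LINE__, "HPL_foo", "invalid value %d", n);
+   * which prints the message and returns; unlike HPL_abort, HPL_warn
+   * does not terminate the program.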
+ * + * --------------------------------------------------------------------- + */ + + va_list argptr; + char cline[128]; + + va_start(argptr, FORM); + (void)vsprintf(cline, FORM, argptr); + va_end(argptr); + /* + * Display an error message + */ + if(LINE <= 0) + HPL_fprintf(STREAM, + "%s %s:\n>>> %s <<<\n\n", + "HPL ERROR in function", + SRNAME, + cline); + else + HPL_fprintf(STREAM, + "%s %d %s %s:\n>>> %s <<<\n\n", + "HPL ERROR on line", + LINE, + "of function", + SRNAME, + cline); +} diff --git a/src/blas/HPL_daxpy.cpp b/src/blas/HPL_daxpy.cpp new file mode 100644 index 0000000..b1bc697 --- /dev/null +++ b/src/blas/HPL_daxpy.cpp @@ -0,0 +1,43 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_daxpy_omp(const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB - II, N); + HPL_daxpy(nn, ALPHA, X, INCX, Y, INCY); + } + ++tile; + int i = NB - II; + for(; i < N; i += NB) { + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB, N - i); + HPL_daxpy(nn, ALPHA, X + i * INCX, INCX, Y + i * INCY, INCY); + } + ++tile; + } +} diff --git a/src/blas/HPL_dgemm.cpp b/src/blas/HPL_dgemm.cpp new file mode 100644 index 0000000..1bd824f --- /dev/null +++ b/src/blas/HPL_dgemm.cpp @@ -0,0 +1,65 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dgemm_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANSA, + const enum HPL_TRANS TRANSB, + const int M, + const int N, + const int K, + const double ALPHA, + const double* A, + const int LDA, + const double* B, + const int LDB, + const double BETA, + double* C, + const int LDC, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB - II, M); + HPL_dgemm( + ORDER, TRANSA, TRANSB, mm, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC); + } + ++tile; + int i = NB - II; + for(; i < M; i += NB) { + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB, M - i); + HPL_dgemm(ORDER, + TRANSA, + TRANSB, + mm, + N, + K, + ALPHA, + A + i, + LDA, + B, + LDB, + BETA, + C + i, + LDC); + } + ++tile; + } +} diff --git a/src/blas/HPL_dgemv.cpp b/src/blas/HPL_dgemv.cpp new file mode 100644 index 0000000..19631cb --- /dev/null +++ b/src/blas/HPL_dgemv.cpp @@ -0,0 +1,60 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dgemv_omp(const enum HPL_ORDER ORDER, + const enum HPL_TRANS TRANS, + const int M, + const int N, + const double ALPHA, + const double* A, + const int LDA, + const double* X, + const int INCX, + const double BETA, + double* Y, + const int INCY, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB - II, M); + HPL_dgemv(ORDER, TRANS, mm, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY); + } + ++tile; + int i = NB - II; + for(; i < M; i += NB) { + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB, M - i); + HPL_dgemv(ORDER, + TRANS, + mm, + N, + ALPHA, + A + i, + LDA, + X, + INCX, + BETA, + Y + i * INCY, + INCY); + } + ++tile; + } +} diff --git a/src/blas/HPL_dger.cpp b/src/blas/HPL_dger.cpp new file mode 100644 index 0000000..23b2d69 --- /dev/null +++ b/src/blas/HPL_dger.cpp @@ -0,0 +1,47 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dger_omp(const enum HPL_ORDER ORDER, + const int M, + const int N, + const double ALPHA, + const double* X, + const int INCX, + double* Y, + const int INCY, + double* A, + const int LDA, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB - II, M); + HPL_dger(ORDER, mm, N, ALPHA, X, INCX, Y, INCY, A, LDA); + } + ++tile; + int i = NB - II; + for(; i < M; i += NB) { + if(tile % thread_size == thread_rank) { + const int mm = Mmin(NB, M - i); + HPL_dger(ORDER, mm, N, ALPHA, X + i * INCX, INCX, Y, INCY, A + i, LDA); + } + ++tile; + } +} diff --git a/src/blas/HPL_dscal.cpp b/src/blas/HPL_dscal.cpp new file mode 100644 index 0000000..fa26fcb --- /dev/null +++ b/src/blas/HPL_dscal.cpp @@ -0,0 +1,41 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_dscal_omp(const int N, + const double ALPHA, + double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size) { + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB - II, N); + HPL_dscal(nn, ALPHA, X, INCX); + } + ++tile; + int i = NB - II; + for(; i < N; i += NB) { + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB, N - i); + HPL_dscal(nn, ALPHA, X + i * INCX, INCX); + } + ++tile; + } +} diff --git a/src/blas/HPL_idamax.cpp b/src/blas/HPL_idamax.cpp new file mode 100644 index 0000000..b9eeb63 --- /dev/null +++ b/src/blas/HPL_idamax.cpp @@ -0,0 +1,64 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_idamax_omp(const int N, + const double* X, + const int INCX, + const int NB, + const int II, + const int thread_rank, + const int thread_size, + int* max_index, + double* max_value) { + + max_index[thread_rank] = 0; + max_value[thread_rank] = 0.0; + + if(N < 1) return; + + int tile = 0; + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB - II, N); + max_index[thread_rank] = HPL_idamax(nn, X, INCX); + max_value[thread_rank] = X[max_index[thread_rank] * INCX]; + } + ++tile; + int i = NB - II; + for(; i < N; i += NB) { + if(tile % thread_size == thread_rank) { + const int nn = Mmin(NB, N - i); + const int idm = HPL_idamax(nn, X + i * INCX, INCX); + if(abs(X[(idm + i) * INCX]) > abs(max_value[thread_rank])) { + max_value[thread_rank] = X[(idm + i) * INCX]; + max_index[thread_rank] = idm + i; + } + } + ++tile; + } + +#pragma omp barrier + + // finish reduction + if(thread_rank == 0) { + for(int rank = 1; rank < thread_size; ++rank) { + if(abs(max_value[rank]) > abs(max_value[0])) { + max_value[0] = max_value[rank]; + max_index[0] = max_index[rank]; + } + } + } +} diff --git a/src/comm/HPL_all_reduce.cpp b/src/comm/HPL_all_reduce.cpp new file mode 100644 index 0000000..10ed5cb --- /dev/null +++ b/src/comm/HPL_all_reduce.cpp @@ -0,0 +1,59 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_all_reduce(void* BUFFER, + const int COUNT, + const HPL_T_TYPE DTYPE, + const HPL_T_OP OP, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_all_reduce performs a global reduce operation across all + * processes of a group leaving the results on all processes. + * + * Arguments + * ========= + * + * BUFFER (local input/global output) void * + * On entry, BUFFER points to the buffer to be combined. On + * exit, this array contains the combined data and is identical + * on all processes in the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be at least zero. + * + * DTYPE (global input) const HPL_T_TYPE + * On entry, DTYPE specifies the type of the buffers operands. + * + * OP (global input) const HPL_T_OP + * On entry, OP is a pointer to the local combine function. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr = MPI_Allreduce( + MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, COMM); + + return ((ierr == MPI_SUCCESS ? 
HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_all_reduce_dmxswp.cpp b/src/comm/HPL_all_reduce_dmxswp.cpp new file mode 100644 index 0000000..8c473b4 --- /dev/null +++ b/src/comm/HPL_all_reduce_dmxswp.cpp @@ -0,0 +1,298 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <cassert> + +/* MPI_Op_create is called in main to bind HPL_dmxswp to this MPI_Op */ +MPI_Op HPL_DMXSWP; +MPI_Datatype PDFACT_ROW; + +/* Swap-broadcast comparison function usable in MPI_Allreduce */ +void HPL_dmxswp(void* invec, void* inoutvec, int* len, MPI_Datatype* datatype) { + + assert(*datatype == PDFACT_ROW); + assert(*len == 1); + + int N; + MPI_Type_size(PDFACT_ROW, &N); + + double* Wwork = static_cast<double*>(invec); + double* WORK = static_cast<double*>(inoutvec); + + const int jb = (N / sizeof(double)) - 4; + + // check max column value and overwrite row if new max is found + const double gmax = Mabs(WORK[0]); + const double tmp1 = Mabs(Wwork[0]); + if((tmp1 > gmax) || ((tmp1 == gmax) && (Wwork[3] < WORK[3]))) { + HPL_dcopy(jb + 4, Wwork, 1, WORK, 1); + } +} + +void HPL_all_reduce_dmxswp(double* BUFFER, + const int COUNT, + const int ROOT, + MPI_Comm COMM, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_all_reduce_dmxswp is a specialized all_reduce that performs + * the swap-broadcast of rows. + * + * Arguments + * ========= + * + * BUFFER (local input/global output) double * + * On entry, BUFFER points to the buffer to be combined. On + * exit, this array contains the combined data and is identical + * on all processes in the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be 4+2*JB, where JB is the length of the rows being + * swapped. + * + * ROOT (local input) int + * On entry, ROOT specifies the rank of the process owning the + * row to be swapped. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least COUNT.
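+ * (Layout note: the first four entries of BUFFER form a header; + * BUFFER[0] holds the candidate pivot value, compared by absolute + * value with ties broken in favor of the smaller process row, and + * BUFFER[3] holds the process row owning it. The JB entries that + * follow contain the candidate pivot row, and the last JB entries a + * copy of the current row of the matrix.)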
+ + * --------------------------------------------------------------------- + */ + + roctxRangePush("HPL_all_reduce_dmxswp"); + +#ifdef HPL_USE_COLLECTIVES + + const int myrow = static_cast(BUFFER[3]); + const int jb = (COUNT - 4) / 2; + + /* Use a normal all_reduce */ + (void)MPI_Allreduce(MPI_IN_PLACE, BUFFER, 1, PDFACT_ROW, HPL_DMXSWP, COMM); + + /*Location of max row*/ + const int maxrow = static_cast(BUFFER[3]); + + if(myrow == ROOT) { /*Root send top row to maxrow*/ + if(maxrow != ROOT) { + double* Wwork = BUFFER + 4 + jb; + HPL_send(Wwork, jb, maxrow, MSGID_BEGIN_PFACT, COMM); + } + } else if(myrow == maxrow) { /*Recv top row from ROOT*/ + double* Wwork = BUFFER + 4 + jb; + HPL_recv(Wwork, jb, ROOT, MSGID_BEGIN_PFACT, COMM); + } + +#else + + double gmax, tmp1; + double * A0, *Wmx; + unsigned int hdim, ip2, ip2_, ipow, k, mask; + int Np2, cnt_, cnt0, i, icurrow, mydist, mydis_, myrow, n0, nprow, partner, + rcnt, root, scnt, size_; + + MPI_Comm_rank(COMM, &myrow); + MPI_Comm_size(COMM, &nprow); + + /* + * ip2 : largest power of two <= nprow; + * hdim : ip2 procs hypercube dim; + */ + hdim = 0; + ip2 = 1; + k = nprow; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + hdim++; + } + + n0 = (COUNT - 4) / 2; + icurrow = ROOT; + Np2 = (int)((size_ = nprow - ip2) != 0); + mydist = MModSub(myrow, icurrow, nprow); + + /* + * Set up pointers in workspace: WORK and Wwork point to the beginning + * of the buffers of size 4 + 2*N0 to be combined. Wmx points to the row + * owning the local (before combine) and global (after combine) absolute + * value max. A0 points to the copy of the current row of the matrix. + */ + + cnt0 = (cnt_ = n0 + 4) + n0; + A0 = (Wmx = BUFFER + 4) + n0; + + /* + * Combine the results (bi-directional exchange): the process coordina- + * tes are relative to icurrow, this allows to reduce the communication + * volume when nprow is not a power of 2. + * + * When nprow is not a power of 2: proc[i-ip2] receives local data from + * proc[i] for all i in [ip2..nprow). In addition, proc[0] (icurrow) + * sends to proc[ip2] the current row of A for later broadcast in procs + * [ip2..nprow). + */ + if((Np2 != 0) && ((partner = (int)((unsigned int)(mydist) ^ ip2)) < nprow)) { + if((mydist & ip2) != 0) { + if(mydist == (int)(ip2)) + (void)HPL_sdrv(BUFFER, + cnt_, + MSGID_BEGIN_PFACT, + A0, + n0, + MSGID_BEGIN_PFACT, + MModAdd(partner, icurrow, nprow), + COMM); + else + (void)HPL_send(BUFFER, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + } else { + if(mydist == 0) + (void)HPL_sdrv(A0, + n0, + MSGID_BEGIN_PFACT, + WORK, + cnt_, + MSGID_BEGIN_PFACT, + MModAdd(partner, icurrow, nprow), + COMM); + else + (void)HPL_recv(WORK, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + + tmp1 = Mabs(WORK[0]); + gmax = Mabs(BUFFER[0]); + if((tmp1 > gmax) || ((tmp1 == gmax) && (WORK[3] < BUFFER[3]))) { + HPL_dcopy(cnt_, WORK, 1, BUFFER, 1); + } + } + } + + if(mydist < (int)(ip2)) { + /* + * power of 2 part of the processes collection: processes [0..ip2) are + * combining (binary exchange); proc[0] has two rows to send, but one to + * receive. At every step k in [0..hdim) of the algorithm, a process + * pair exchanging 2 rows is such that myrow >> k+1 is 0. Among those + * processes the ones that are sending one more row than what they are + * receiving are such that myrow >> k is equal to 0. 
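+ * + * For example, with nprow = 6: ip2 = 4 and hdim = 2. The processes at + * distance 4 and 5 from icurrow first fold their candidate rows into + * the processes at distance 0 and 1 (the process at distance 4 also + * receiving the copy of the current row), the remaining four + * processes combine in two binary-exchange steps, the process at + * distance 4 re-broadcasts the current row to the one at distance 5, + * and the processes at distance 0 and 1 finally return the winning + * row to those at distance 4 and 5.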
+ */ + k = 0; + ipow = 1; + + while(k < hdim) { + if(((unsigned int)(mydist) >> (k + 1)) == 0) { + if(((unsigned int)(mydist) >> k) == 0) { + scnt = cnt0; + rcnt = cnt_; + } else { + scnt = cnt_; + rcnt = cnt0; + } + } else { + scnt = rcnt = cnt_; + } + + partner = (int)((unsigned int)(mydist) ^ ipow); + (void)HPL_sdrv(BUFFER, + scnt, + MSGID_BEGIN_PFACT, + WORK, + rcnt, + MSGID_BEGIN_PFACT, + MModAdd(partner, icurrow, nprow), + COMM); + + tmp1 = Mabs(WORK[0]); + gmax = Mabs(BUFFER[0]); + if((tmp1 > gmax) || ((tmp1 == gmax) && (WORK[3] < BUFFER[3]))) { + HPL_dcopy((rcnt == cnt0 ? cnt0 : cnt_), WORK, 1, BUFFER, 1); + } else if(rcnt == cnt0) { + HPL_dcopy(n0, WORK + cnt_, 1, A0, 1); + } + + ipow <<= 1; + k++; + } + } else if(size_ > 1) { + /* + * proc[ip2] broadcast current row of A to procs [ip2+1..nprow). + */ + k = (unsigned int)(size_)-1; + ip2_ = mask = 1; + while(k > 1) { + k >>= 1; + ip2_ <<= 1; + mask <<= 1; + mask++; + } + + root = MModAdd(icurrow, (int)(ip2), nprow); + mydis_ = MModSub(myrow, root, nprow); + + do { + mask ^= ip2_; + if((mydis_ & mask) == 0) { + partner = (int)(mydis_ ^ ip2_); + if((mydis_ & ip2_) != 0) { + (void)HPL_recv( + A0, n0, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM); + } else if(partner < size_) { + (void)HPL_send( + A0, n0, MModAdd(root, partner, nprow), MSGID_BEGIN_PFACT, COMM); + } + } + ip2_ >>= 1; + } while(ip2_ > 0); + } + /* + * If nprow is not a power of 2, for all i in [ip2..nprow), proc[i-ip2] + * sends the pivot row to proc[i] along with the first four entries of + * the BUFFER array. + */ + if((Np2 != 0) && ((partner = (int)((unsigned int)(mydist) ^ ip2)) < nprow)) { + if((mydist & ip2) != 0) { + (void)HPL_recv(BUFFER, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + } else { + (void)HPL_send(BUFFER, + cnt_, + MModAdd(partner, icurrow, nprow), + MSGID_BEGIN_PFACT, + COMM); + } + } + +#endif + roctxRangePop(); +} diff --git a/src/comm/HPL_allgatherv.cpp b/src/comm/HPL_allgatherv.cpp new file mode 100644 index 0000000..17f0bad --- /dev/null +++ b/src/comm/HPL_allgatherv.cpp @@ -0,0 +1,128 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_allgatherv(double* BUF, + const int SCOUNT, + const int* RCOUNT, + const int* DISPL, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_allgatherv is a simple wrapper around an in-place MPI_Allgatherv. + * Its main purpose is to allow for some experimentation / tuning + * of this simple routine. Successful completion is indicated by + * the returned error code HPL_SUCCESS. + * + * Arguments + * ========= + * + * BUF (local input/output) double * + * On entry, BUF specifies the starting address of the buffer + * to be gathered. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in BUF contributed by this process to the gather.
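+ * (SCOUNT should equal RCOUNT[rank]: the gather is performed in + * place, so the contribution of this process is assumed to already + * reside at BUF + DISPL[rank].)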
+ * + * RCOUNT (local input) int * + * On entry, RCOUNT is an array of length SIZE specifying + * the number of double precision entries in BUF to receive from + * each process. + * + * DISPL (local input) int * + * On entry, DISPL is an array of length SIZE specifying the + * displacement (relative to BUF) from which to place the incoming + * data from each process. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + roctxRangePush("HPL_Allgatherv"); + +#ifdef HPL_USE_COLLECTIVES + + int ierr = MPI_Allgatherv( + MPI_IN_PLACE, SCOUNT, MPI_DOUBLE, BUF, RCOUNT, DISPL, MPI_DOUBLE, COMM); + +#else + + int rank, size, ierr = MPI_SUCCESS; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + /* + * Ring exchange + */ + const int npm1 = size - 1; + const int prev = MModSub1(rank, size); + const int next = MModAdd1(rank, size); + + const int tag = 0; + + for(int k = 0; k < npm1; k++) { + MPI_Request request; + MPI_Status status; + const int l = (int)((unsigned int)(k) >> 1); + + int il, lengthS, lengthR, partner, ibufS, ibufR; + if(((rank + k) & 1) != 0) { + il = MModAdd(rank, l, size); + ibufS = DISPL[il]; + lengthS = RCOUNT[il]; + il = MModSub(rank, l + 1, size); + ibufR = DISPL[il]; + lengthR = RCOUNT[il]; + partner = prev; + } else { + il = MModSub(rank, l, size); + ibufS = DISPL[il]; + lengthS = RCOUNT[il]; + il = MModAdd(rank, l + 1, size); + ibufR = DISPL[il]; + lengthR = RCOUNT[il]; + partner = next; + } + + if(lengthR > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Irecv( + BUF + ibufR, lengthR, MPI_DOUBLE, partner, tag, COMM, &request); + } + + if(lengthS > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(BUF + ibufS, lengthS, MPI_DOUBLE, partner, tag, COMM); + } + + if(lengthR > 0) { + if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); + } + } + +#endif + + roctxRangePop(); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_barrier.cpp b/src/comm/HPL_barrier.cpp new file mode 100644 index 0000000..747571d --- /dev/null +++ b/src/comm/HPL_barrier.cpp @@ -0,0 +1,40 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_barrier(MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_barrier blocks the caller until all process members have called it. + * The call returns at any process only after all group members have + * entered the call. + * + * Arguments + * ========= + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr = MPI_Barrier(COMM); + + return ((ierr == MPI_SUCCESS ?
HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_bcast.cpp b/src/comm/HPL_bcast.cpp new file mode 100644 index 0000000..7cfce11 --- /dev/null +++ b/src/comm/HPL_bcast.cpp @@ -0,0 +1,82 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast(double* SBUF, + int SCOUNT, + int ROOT, + MPI_Comm COMM, + HPL_T_TOP top) { + /* + * Purpose + * ======= + * + * HPL_bcast is a simple wrapper around MPI_Bcast. Its main purpose is + * to allow for some experimentation / tuning of this simple routine. + * Successful completion is indicated by the returned error code + * HPL_SUCCESS. In the case of messages of length less than or equal to + * zero, this function returns immediately. + * + * Arguments + * ========= + * + * SBUF (local input) double * + * On entry, SBUF specifies the starting address of buffer to be + * broadcast. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in SBUF. SCOUNT must be at least zero. + * + * ROOT (local input) int + * On entry, ROOT specifies the rank of the origin process in + * the communication space defined by COMM. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + if(SCOUNT <= 0) return (HPL_SUCCESS); + + int ierr; + + roctxRangePush("HPL_Bcast"); + +#ifdef HPL_USE_COLLECTIVES + + ierr = MPI_Bcast(SBUF, SCOUNT, MPI_DOUBLE, ROOT, COMM); + +#else + + switch(top) { + case HPL_1RING_M: ierr = HPL_bcast_1rinM(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_1RING: ierr = HPL_bcast_1ring(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_2RING_M: ierr = HPL_bcast_2rinM(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_2RING: ierr = HPL_bcast_2ring(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_BLONG_M: ierr = HPL_bcast_blonM(SBUF, SCOUNT, ROOT, COMM); break; + case HPL_BLONG: ierr = HPL_bcast_blong(SBUF, SCOUNT, ROOT, COMM); break; + default: ierr = HPL_FAILURE; + } + +#endif + + roctxRangePop(); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_bcast_1rinM.cpp b/src/comm/HPL_bcast_1rinM.cpp new file mode 100644 index 0000000..7927d5a --- /dev/null +++ b/src/comm/HPL_bcast_1rinM.cpp @@ -0,0 +1,109 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_1rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*Root immediately sends to ROOT+1*/ + if(rank == ROOT) { + MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); + } else if(rank == MModAdd1(ROOT, size)) { + MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); + return MPI_SUCCESS; + } + + if(size == 2) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[2]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + + const int tag = rank; + const int next = + (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); + const int prev = + (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); + + double* RBUF = SBUF; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend = (rank == size - 1) ? 0 : SCOUNT; + int Nrecv = (rank == 0) ? 0 : SCOUNT; + + /*Recv from left*/ + int Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } + + /*Send to right if there is data present to send*/ + int Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } + + while(Nsend > 0 || Nrecv > 0) { + int index = -1; + MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv == Nsend) { + Ns = Nr; + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } + + /*Count the recv'd amounts*/ + Nrecv -= Nr; + RBUF += Nr; + + /*Post next recv if needed*/ + Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { + MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Sent to right */ + Nsend -= Ns; + SBUF += Ns; + + /*Send to right if there is data present to send*/ + Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_1ring.cpp b/src/comm/HPL_bcast_1ring.cpp new file mode 100644 index 0000000..4a1854f --- /dev/null +++ b/src/comm/HPL_bcast_1ring.cpp @@ -0,0 +1,99 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_1ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + // int chunk_size = 64 * 512; // 256KB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[2]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + + const int tag = rank; + const int next = MModAdd1(rank, size); + const int prev = MModSub1(rank, size); + + /*Mid point of message*/ + double* RBUF = SBUF; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend = (rank == size - 1) ? 0 : SCOUNT; + int Nrecv = (rank == 0) ? 0 : SCOUNT; + + /*Recv from left*/ + int Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); } + + /*Send to right if there is data present to send*/ + int Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); } + + while(Nsend > 0 || Nrecv > 0) { + int index = -1; + MPI_Waitany(2, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv == Nsend) { + Ns = Nr; + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } + + /*Count the recv'd amounts*/ + Nrecv -= Nr; + RBUF += Nr; + + /*Post next recv if needed*/ + Nr = std::min(Nrecv, chunk_size); + if(Nr > 0) { + MPI_Irecv(RBUF, Nr, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Sent to right */ + Nsend -= Ns; + SBUF += Ns; + + /*Send to right if there is data present to send*/ + Ns = std::min(Nsend - Nrecv, chunk_size); + if(Ns > 0) { + MPI_Isend(SBUF, Ns, MPI_DOUBLE, next, tag, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_2rinM.cpp b/src/comm/HPL_bcast_2rinM.cpp new file mode 100644 index 0000000..cea56cf --- /dev/null +++ b/src/comm/HPL_bcast_2rinM.cpp @@ -0,0 +1,165 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_2rinM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*Root immediately sends to ROOT+1*/ + if(rank == ROOT) { + MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(ROOT, size), ROOT, COMM); + } else if(rank == MModAdd1(ROOT, size)) { + MPI_Recv(SBUF, SCOUNT, MPI_DOUBLE, ROOT, ROOT, COMM, MPI_STATUS_IGNORE); + return MPI_SUCCESS; + } + + if(size == 2) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[4]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + request[2] = MPI_REQUEST_NULL; + request[3] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + const int NchunksHalf = (Nchunks + 1) / 2; + + const int tag = rank; + const int next = + (rank == ROOT) ? MModAdd(ROOT, 2, size) : MModAdd1(rank, size); + const int prev = + (rank == MModAdd(ROOT, 2, size)) ? ROOT : MModSub1(rank, size); + + /*Mid point of message*/ + double* SBUF0 = SBUF; + double* SBUF1 = SBUF + NchunksHalf * chunk_size; + + double* RBUF0 = SBUF0; + double* RBUF1 = SBUF1; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; + int Nsend1 = (rank == 2) ? 0 : SCOUNT - NchunksHalf * chunk_size; + + int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; + int Nrecv1 = (rank == 0) ? 0 : SCOUNT - NchunksHalf * chunk_size; + + /*Recv from left*/ + int Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } + + /*Recv from right*/ + int Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } + + /*Send to right if there is data present to send*/ + int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Send to left if there is data present to send*/ + int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { + int index = -1; + MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv0 == Nsend0) { + Ns0 = Nr0; + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Count the recv'd amounts*/ + Nrecv0 -= Nr0; + RBUF0 += Nr0; + + /*Post next recv if needed*/ + Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Recv'd from right*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv1 == Nsend1) { + Ns1 = Nr1; + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + /*Count the recv'd amounts*/ + Nrecv1 -= Nr1; + RBUF1 += Nr1; + + /*Post next recv if needed*/ + Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + + } 
else if(index == 2) { /*Sent to right */ + Nsend0 -= Ns0; + SBUF0 += Ns0; + + /*Send to right if there is data present to send*/ + Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } else { + request[2] = MPI_REQUEST_NULL; + } + } else { /*index==3, Sent to left */ + Nsend1 -= Ns1; + SBUF1 += Ns1; + + Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } else { + request[3] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_2ring.cpp b/src/comm/HPL_bcast_2ring.cpp new file mode 100644 index 0000000..d6c36dd --- /dev/null +++ b/src/comm/HPL_bcast_2ring.cpp @@ -0,0 +1,153 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_2ring(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /*One ring exchange to rule them all*/ + int chunk_size = 512 * 512; // 2MB + + chunk_size = std::min(chunk_size, SCOUNT); + + MPI_Request request[4]; + + request[0] = MPI_REQUEST_NULL; + request[1] = MPI_REQUEST_NULL; + request[2] = MPI_REQUEST_NULL; + request[3] = MPI_REQUEST_NULL; + + const int Nchunks = (SCOUNT + chunk_size - 1) / chunk_size; + const int NchunksHalf = (Nchunks + 1) / 2; + + const int tag = rank; + const int next = MModAdd1(rank, size); + const int prev = MModSub1(rank, size); + + /*Mid point of message*/ + double* SBUF0 = SBUF; + double* SBUF1 = SBUF + NchunksHalf * chunk_size; + + double* RBUF0 = SBUF0; + double* RBUF1 = SBUF1; + + /*Shift to ROOT=0*/ + rank = MModSub(rank, ROOT, size); + + int Nsend0 = (rank == size - 1) ? 0 : NchunksHalf * chunk_size; + int Nsend1 = (rank == 1) ? 0 : SCOUNT - NchunksHalf * chunk_size; + + int Nrecv0 = (rank == 0) ? 0 : NchunksHalf * chunk_size; + int Nrecv1 = (rank == 0) ? 
0 : SCOUNT - NchunksHalf * chunk_size; + + /*Recv from left*/ + int Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } + + /*Recv from right*/ + int Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } + + /*Send to right if there is data present to send*/ + int Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Send to left if there is data present to send*/ + int Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + while(Nsend0 > 0 || Nsend1 > 0 || Nrecv0 > 0 || Nrecv1 > 0) { + int index = -1; + MPI_Waitany(4, request, &index, MPI_STATUSES_IGNORE); + + if(index == 0) { /*Recv'd from left*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv0 == Nsend0) { + Ns0 = Nr0; + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } + + /*Count the recv'd amounts*/ + Nrecv0 -= Nr0; + RBUF0 += Nr0; + + /*Post next recv if needed*/ + Nr0 = std::min(Nrecv0, chunk_size); + if(Nr0 > 0) { + MPI_Irecv(RBUF0, Nr0, MPI_DOUBLE, prev, prev, COMM, request + 0); + } else { + request[0] = MPI_REQUEST_NULL; + } + + } else if(index == 1) { /*Recv'd from right*/ + /*If we're waiting on this recv in order to send, send now*/ + if(Nrecv1 == Nsend1) { + Ns1 = Nr1; + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } + + /*Count the recv'd amounts*/ + Nrecv1 -= Nr1; + RBUF1 += Nr1; + + /*Post next recv if needed*/ + Nr1 = std::min(Nrecv1, chunk_size); + if(Nr1 > 0) { + MPI_Irecv(RBUF1, Nr1, MPI_DOUBLE, next, next, COMM, request + 1); + } else { + request[1] = MPI_REQUEST_NULL; + } + + } else if(index == 2) { /*Sent to right */ + Nsend0 -= Ns0; + SBUF0 += Ns0; + + /*Send to right if there is data present to send*/ + Ns0 = std::min(Nsend0 - Nrecv0, chunk_size); + if(Ns0 > 0) { + MPI_Isend(SBUF0, Ns0, MPI_DOUBLE, next, tag, COMM, request + 2); + } else { + request[2] = MPI_REQUEST_NULL; + } + } else { /*index==3, Sent to left */ + Nsend1 -= Ns1; + SBUF1 += Ns1; + + Ns1 = std::min(Nsend1 - Nrecv1, chunk_size); + if(Ns1 > 0) { + MPI_Isend(SBUF1, Ns1, MPI_DOUBLE, prev, tag, COMM, request + 3); + } else { + request[3] = MPI_REQUEST_NULL; + } + } + } + + return MPI_SUCCESS; +} diff --git a/src/comm/HPL_bcast_blonM.cpp b/src/comm/HPL_bcast_blonM.cpp new file mode 100644 index 0000000..fd85f13 --- /dev/null +++ b/src/comm/HPL_bcast_blonM.cpp @@ -0,0 +1,185 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_blonM(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /* + * Cast phase: ROOT process sends to its right neighbor, then spread + * the panel on the other npcol - 2 processes. 
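+ * (The spread here is a hypercube scatter of panel pieces over those + * npcol - 2 processes, followed by a ring roll, below, that + * reassembles the full panel on every process.)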
If I am not the ROOT + * process, probe for message received. If the message is there, then + * receive it. If I am just after the ROOT process, return. Otherwise, + * keep spreading on those npcol - 2 processes. Otherwise, inform the + * caller that the panel has still not been received. + */ + int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2 = 1, k, l, lbuf, + lbufR, lbufS, mask = 1, mydist, mydist2, next, npm1, npm2, partner, + prev; + + const int tag = ROOT; + next = MModAdd1(rank, size); + prev = MModSub1(rank, size); + + if(rank == ROOT) { + if(ierr == MPI_SUCCESS) + ierr = + MPI_Send(SBUF, SCOUNT, MPI_DOUBLE, MModAdd1(rank, size), tag, COMM); + } else if(prev == ROOT) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Recv( + SBUF, SCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); + } + /* + * if I am just after the ROOT, exit now. The message receive completed + * successfully, this guy is done. If there are only 2 processes in each + * row of processes, we are done as well. + */ + if((prev == ROOT) || (size == 2)) return ierr; + /* + * Otherwise, proceed with broadcast - Spread the panel across process + * columns + */ + npm2 = (npm1 = size - 1) - 1; + + k = npm2; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + if(rank == ROOT) + mydist2 = (mydist = 0); + else + mydist2 = (mydist = MModSub(rank, ROOT, size) - 1); + + indx = ip2; + count = SCOUNT / npm1; + count = Mmax(count, 1); + + do { + mask ^= ip2; + + if((mydist & mask) == 0) { + lbuf = SCOUNT - (ibuf = indx * count); + if(indx + ip2 < npm1) { + l = ip2 * count; + lbuf = Mmin(lbuf, l); + } + + partner = mydist ^ ip2; + + if((mydist & ip2) != 0) { + partner = MModAdd(ROOT, partner, size); + if(partner != ROOT) partner = MModAdd1(partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Recv(SBUF + ibuf, + lbuf, + MPI_DOUBLE, + partner, + tag, + COMM, + MPI_STATUS_IGNORE); + } + } else if(partner < npm1) { + partner = MModAdd(ROOT, partner, size); + if(partner != ROOT) partner = MModAdd1(partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); + } + } + } + + if(mydist2 < ip2) { + ip2 >>= 1; + indx -= ip2; + } else { + mydist2 -= ip2; + ip2 >>= 1; + indx += ip2; + } + + } while(ip2 > 0); + /* + * Roll the pieces + */ + if(MModSub1(prev, size) == ROOT) prev = ROOT; + if(rank == ROOT) next = MModAdd1(next, size); + + for(k = 0; k < npm2; k++) { + l = (k >> 1); + /* + * Who is sending to who and how much + */ + if(((mydist + k) & 1) != 0) { + ibufS = (indx = MModAdd(mydist, l, npm1)) * count; + lbufS = (indx == npm2 ? SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModSub(mydist, l + 1, npm1)) * count; + lbufR = (indx == npm2 ? SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = prev; + } else { + ibufS = (indx = MModSub(mydist, l, npm1)) * count; + lbufS = (indx == npm2 ? SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModAdd(mydist, l + 1, npm1)) * count; + lbufR = (indx == npm2 ? 
SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = next; + } + /* + * Exchange the messages + */ + MPI_Request request; + MPI_Status status; + + if(lbufR > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Irecv( + SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); + } + + if(lbufS > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); + } + + if(lbufR > 0) + if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); + } + + return ierr; +} diff --git a/src/comm/HPL_bcast_blong.cpp b/src/comm/HPL_bcast_blong.cpp new file mode 100644 index 0000000..9f3f81e --- /dev/null +++ b/src/comm/HPL_bcast_blong.cpp @@ -0,0 +1,161 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_bcast_blong(double* SBUF, int SCOUNT, int ROOT, MPI_Comm COMM) { + + int rank, size; + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + + if(size <= 1) return (MPI_SUCCESS); + + /* + * Cast phase: If I am the ROOT process, start spreading the panel. If + * I am not the ROOT process, test for message receive completion. If + * the message is there, then receive it, and keep spreading in a + * blocking fashion this time. Otherwise, inform the caller that the + * panel has still not been received. + */ + int count, ierr = MPI_SUCCESS, ibuf, ibufR, ibufS, indx, ip2, k, l, lbuf, + lbufR, lbufS, mask, mydist, mydist2, npm1, partner, next, prev; + + const int tag = 0; + + // ip2 : largest power of two <= size-1; + // mask : ip2 procs hypercube mask; + mask = ip2 = 1; + k = size - 1; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + + npm1 = size - 1; + mydist2 = (mydist = MModSub(rank, ROOT, size)); + indx = ip2; + count = SCOUNT / size; + count = Mmax(count, 1); + /* + * Spread the panel across process columns + */ + do { + mask ^= ip2; + + if((mydist & mask) == 0) { + lbuf = SCOUNT - (ibuf = indx * count); + if(indx + ip2 < size) { + l = ip2 * count; + lbuf = Mmin(lbuf, l); + } + + partner = mydist ^ ip2; + + if((mydist & ip2) != 0) { + partner = MModAdd(ROOT, partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Recv(SBUF + ibuf, + lbuf, + MPI_DOUBLE, + partner, + tag, + COMM, + MPI_STATUS_IGNORE); + } + } else if(partner < size) { + partner = MModAdd(ROOT, partner, size); + + if(lbuf > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibuf, lbuf, MPI_DOUBLE, partner, tag, COMM); + } + } + } + + if(mydist2 < ip2) { + ip2 >>= 1; + indx -= ip2; + } else { + mydist2 -= ip2; + ip2 >>= 1; + indx += ip2; + } + + } while(ip2 > 0); + /* + * Roll the pieces + */ + prev = MModSub1(rank, size); + next = MModAdd1(rank, size); + + for(k = 0; k < npm1; k++) { + l = (k >> 1); + /* + * Who is sending to who and how much + */ + if(((mydist + k) & 1) != 0) { + ibufS = (indx = MModAdd(mydist, l, size)) * count; + lbufS = (indx == npm1 ? 
SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModSub(mydist, l + 1, size)) * count; + lbufR = (indx == npm1 ? SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = prev; + } else { + ibufS = (indx = MModSub(mydist, l, size)) * count; + lbufS = (indx == npm1 ? SCOUNT : ibufS + count); + lbufS = Mmin(SCOUNT, lbufS) - ibufS; + lbufS = Mmax(0, lbufS); + + ibufR = (indx = MModAdd(mydist, l + 1, size)) * count; + lbufR = (indx == npm1 ? SCOUNT : ibufR + count); + lbufR = Mmin(SCOUNT, lbufR) - ibufR; + lbufR = Mmax(0, lbufR); + + partner = next; + } + /* + * Exchange the messages + */ + MPI_Request request; + MPI_Status status; + + if(lbufR > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Irecv( + SBUF + ibufR, lbufR, MPI_DOUBLE, partner, tag, COMM, &request); + } + + if(lbufS > 0) { + if(ierr == MPI_SUCCESS) + ierr = MPI_Send(SBUF + ibufS, lbufS, MPI_DOUBLE, partner, tag, COMM); + } + + if(lbufR > 0) + if(ierr == MPI_SUCCESS) ierr = MPI_Wait(&request, &status); + } + + return ierr; +} diff --git a/src/comm/HPL_broadcast.cpp b/src/comm/HPL_broadcast.cpp new file mode 100644 index 0000000..e9362f5 --- /dev/null +++ b/src/comm/HPL_broadcast.cpp @@ -0,0 +1,58 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_broadcast(void* BUFFER, + const int COUNT, + const HPL_T_TYPE DTYPE, + const int ROOT, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_broadcast broadcasts a message from the process with rank ROOT to + * all processes in the group. + * + * Arguments + * ========= + * + * BUFFER (local input/output) void * + * On entry, BUFFER points to the buffer to be broadcast. On + * exit, this array contains the broadcast data and is identical + * on all processes in the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be at least zero. + * + * DTYPE (global input) const HPL_T_TYPE + * On entry, DTYPE specifies the type of the buffers operands. + * + * ROOT (global input) const int + * On entry, ROOT is the coordinate of the source process. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr = MPI_Bcast(BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), ROOT, COMM); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_recv.cpp b/src/comm/HPL_recv.cpp new file mode 100644 index 0000000..665f9bb --- /dev/null +++ b/src/comm/HPL_recv.cpp @@ -0,0 +1,63 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_recv(double* RBUF, int RCOUNT, int SRC, int RTAG, MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_recv is a simple wrapper around MPI_Recv. Its main purpose is + * to allow for some experimentation / tuning of this simple routine. + * Successful completion is indicated by the returned error code + * HPL_SUCCESS. In the case of messages of length less than or equal to + * zero, this function returns immediately. + * + * Arguments + * ========= + * + * RBUF (local output) double * + * On entry, RBUF specifies the starting address of buffer to be + * received. + * + * RCOUNT (local input) int + * On entry, RCOUNT specifies the number of double precision + * entries in RBUF. RCOUNT must be at least zero. + * + * SRC (local input) int + * On entry, SRC specifies the rank of the sending process in + * the communication space defined by COMM. + * + * RTAG (local input) int + * On entry, RTAG specifies the message tag to be used for this + * communication operation. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + if(RCOUNT <= 0) return (HPL_SUCCESS); + + MPI_Status status; + + int ierr = + MPI_Recv((void*)(RBUF), RCOUNT, MPI_DOUBLE, SRC, RTAG, COMM, &status); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_reduce.cpp b/src/comm/HPL_reduce.cpp new file mode 100644 index 0000000..ab378f3 --- /dev/null +++ b/src/comm/HPL_reduce.cpp @@ -0,0 +1,74 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_reduce(void* BUFFER, + const int COUNT, + const HPL_T_TYPE DTYPE, + const HPL_T_OP OP, + const int ROOT, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_reduce performs a global reduce operation across all processes of + * a group. Note that the input buffer is used as a workarray, so its + * original data is corrupted in all processes but the accumulating + * process. + * + * Arguments + * ========= + * + * BUFFER (local input/output) void * + * On entry, BUFFER points to the buffer to be reduced. On + * exit, and in process of rank ROOT this array contains the + * reduced data. This buffer is also used as workspace during + * the operation in the other processes of the group. + * + * COUNT (global input) const int + * On entry, COUNT indicates the number of entries in BUFFER. + * COUNT must be at least zero. + * + * DTYPE (global input) const HPL_T_TYPE + * On entry, DTYPE specifies the type of the buffers operands.
+ * + * OP (global input) const HPL_T_OP + * On entry, OP is a pointer to the local combine function. + * + * ROOT (global input) const int + * On entry, ROOT is the coordinate of the accumulating process. + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * --------------------------------------------------------------------- + */ + + int ierr; + + int rank; + MPI_Comm_rank(COMM, &rank); + + if(rank == ROOT) + ierr = MPI_Reduce( + MPI_IN_PLACE, BUFFER, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM); + else + ierr = + MPI_Reduce(BUFFER, NULL, COUNT, HPL_2_MPI_TYPE(DTYPE), OP, ROOT, COMM); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_scatterv.cpp b/src/comm/HPL_scatterv.cpp new file mode 100644 index 0000000..433be71 --- /dev/null +++ b/src/comm/HPL_scatterv.cpp @@ -0,0 +1,125 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_scatterv(double* BUF, + const int* SCOUNT, + const int* DISPL, + const int RCOUNT, + int ROOT, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_scatterv is a simple wrapper around an in-place MPI_Scatterv. + * Its main purpose is to allow for some experimentation / tuning + * of this simple routine. Successful completion is indicated by + * the returned error code HPL_SUCCESS. + * + * Arguments + * ========= + * + * BUF (local input/output) double * + * On entry, on the root process BUF specifies the starting + * address of buffer to be scattered. On non-root processes, + * BUF specifies the starting point of the received buffer. + * + * SCOUNT (local input) int * + * On entry, SCOUNT is an array of length SIZE specifying + * the number of double precision entries in BUF to send to + * each process. + * + * DISPL (local input) int * + * On entry, DISPL is an array of length SIZE specifying the + * displacement (relative to BUF) from which to take the outgoing + * data to each process from the root process, and the displacement + * (relative to BUF) from which to receive the incoming data on + * each non-root process. + * + * RCOUNT (local input) int + * On entry, RCOUNT specifies the number of double precision + * entries in BUF to be received from the ROOT process. + * + * ROOT (local input) int + * On entry, ROOT specifies the rank of the origin process in + * the communication space defined by COMM. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space.
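+ * + * (Note that the scatter is performed in place on the root process: + * the root's own block at BUF + DISPL[ROOT] stays where it is, and + * no message is sent from the root to itself.)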
+ * + * --------------------------------------------------------------------- + */ + + int rank, ierr = MPI_SUCCESS; + MPI_Comm_rank(COMM, &rank); + + roctxRangePush("HPL_Scatterv"); + +#ifdef HPL_USE_COLLECTIVES + + if(rank == ROOT) { + ierr = MPI_Scatterv(BUF, + SCOUNT, + DISPL, + MPI_DOUBLE, + MPI_IN_PLACE, + RCOUNT, + MPI_DOUBLE, + ROOT, + COMM); + } else { + ierr = MPI_Scatterv( + NULL, SCOUNT, DISPL, MPI_DOUBLE, BUF, RCOUNT, MPI_DOUBLE, ROOT, COMM); + } + +#else + + int size; + MPI_Comm_size(COMM, &size); + + const int tag = ROOT; + if(rank == ROOT) { + MPI_Request requests[size]; + + /*Just send size-1 messages*/ + for(int i = 0; i < size; ++i) { + + requests[i] = MPI_REQUEST_NULL; + + if(i == ROOT) { continue; } + const int ibuf = DISPL[i]; + const int lbuf = SCOUNT[i]; + + if(lbuf > 0) { + (void)MPI_Isend( + BUF + ibuf, lbuf, MPI_DOUBLE, i, tag, COMM, requests + i); + } + } + + MPI_Waitall(size, requests, MPI_STATUSES_IGNORE); + } else { + if(RCOUNT > 0) + ierr = + MPI_Recv(BUF, RCOUNT, MPI_DOUBLE, ROOT, tag, COMM, MPI_STATUS_IGNORE); + } + +#endif + roctxRangePop(); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_sdrv.cpp b/src/comm/HPL_sdrv.cpp new file mode 100644 index 0000000..2fa24ec --- /dev/null +++ b/src/comm/HPL_sdrv.cpp @@ -0,0 +1,91 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_sdrv(double* SBUF, + int SCOUNT, + int STAG, + double* RBUF, + int RCOUNT, + int RTAG, + int PARTNER, + MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_sdrv is a simple wrapper around MPI_Sendrecv. Its main purpose is + * to allow for some experimentation and tuning of this simple function. + * Messages of length less than or equal to zero are not sent nor + * received. Successful completion is indicated by the returned error + * code HPL_SUCCESS. + * + * Arguments + * ========= + * + * SBUF (local input) double * + * On entry, SBUF specifies the starting address of buffer to be + * sent. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in SBUF. SCOUNT must be at least zero. + * + * STAG (local input) int + * On entry, STAG specifies the message tag to be used for the + * sending communication operation. + * + * RBUF (local output) double * + * On entry, RBUF specifies the starting address of buffer to be + * received. + * + * RCOUNT (local input) int + * On entry, RCOUNT specifies the number of double precision + * entries in RBUF. RCOUNT must be at least zero. + * + * RTAG (local input) int + * On entry, RTAG specifies the message tag to be used for the + * receiving communication operation. + * + * PARTNER (local input) int + * On entry, PARTNER specifies the rank of the collaborative + * process in the communication space defined by COMM. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. 
+ * + * --------------------------------------------------------------------- + */ + + MPI_Status status; + int ierr; + + ierr = MPI_Sendrecv(SBUF, + SCOUNT, + MPI_DOUBLE, + PARTNER, + STAG, + RBUF, + RCOUNT, + MPI_DOUBLE, + PARTNER, + RTAG, + COMM, + &status); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/comm/HPL_send.cpp b/src/comm/HPL_send.cpp new file mode 100644 index 0000000..5dae5c7 --- /dev/null +++ b/src/comm/HPL_send.cpp @@ -0,0 +1,60 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_send(double* SBUF, int SCOUNT, int DEST, int STAG, MPI_Comm COMM) { + /* + * Purpose + * ======= + * + * HPL_send is a simple wrapper around MPI_Send. Its main purpose is + * to allow for some experimentation / tuning of this simple routine. + * Successful completion is indicated by the returned error code + * HPL_SUCCESS. In the case of messages of length less than or equal to + * zero, this function returns immediately. + * + * Arguments + * ========= + * + * SBUF (local input) double * + * On entry, SBUF specifies the starting address of buffer to be + * sent. + * + * SCOUNT (local input) int + * On entry, SCOUNT specifies the number of double precision + * entries in SBUF. SCOUNT must be at least zero. + * + * DEST (local input) int + * On entry, DEST specifies the rank of the receiving process in + * the communication space defined by COMM. + * + * STAG (local input) int + * On entry, STAG specifies the message tag to be used for this + * communication operation. + * + * COMM (local input) MPI_Comm + * The MPI communicator identifying the communication space. + * + * --------------------------------------------------------------------- + */ + + if(SCOUNT <= 0) return (HPL_SUCCESS); + + int ierr = MPI_Send((void*)(SBUF), SCOUNT, MPI_DOUBLE, DEST, STAG, COMM); + + return ((ierr == MPI_SUCCESS ? HPL_SUCCESS : HPL_FAILURE)); +} diff --git a/src/grid/HPL_grid_exit.cpp b/src/grid/HPL_grid_exit.cpp new file mode 100644 index 0000000..be94d9c --- /dev/null +++ b/src/grid/HPL_grid_exit.cpp @@ -0,0 +1,58 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_grid_exit(HPL_T_grid* GRID) { + /* + * Purpose + * ======= + * + * HPL_grid_exit marks the process grid object for deallocation. The + * returned error code MPI_SUCCESS indicates successful completion. + * Other error codes are (MPI) implementation dependent.
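+ * The row, column and all-process communicators are freed, and the + * grid coordinates are reset to invalid values.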
+ * + * Arguments + * ========= + * + * GRID (local input/output) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid to be released. + * + * --------------------------------------------------------------------- + */ + + int hplerr = MPI_SUCCESS, mpierr; + + if(GRID->all_comm != MPI_COMM_NULL) { + mpierr = MPI_Comm_free(&(GRID->row_comm)); + if(mpierr != MPI_SUCCESS) hplerr = mpierr; + mpierr = MPI_Comm_free(&(GRID->col_comm)); + if(mpierr != MPI_SUCCESS) hplerr = mpierr; + mpierr = MPI_Comm_free(&(GRID->all_comm)); + if(mpierr != MPI_SUCCESS) hplerr = mpierr; + } + + GRID->order = HPL_COLUMN_MAJOR; + + GRID->iam = GRID->myrow = GRID->mycol = -1; + GRID->nprow = GRID->npcol = GRID->nprocs = -1; + + GRID->row_ip2 = GRID->row_hdim = GRID->row_ip2m1 = GRID->row_mask = -1; + GRID->col_ip2 = GRID->col_hdim = GRID->col_ip2m1 = GRID->col_mask = -1; + + return (hplerr); +} diff --git a/src/grid/HPL_grid_info.cpp b/src/grid/HPL_grid_info.cpp new file mode 100644 index 0000000..51db2b6 --- /dev/null +++ b/src/grid/HPL_grid_info.cpp @@ -0,0 +1,66 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_grid_info(const HPL_T_grid* GRID, + int* NPROW, + int* NPCOL, + int* MYROW, + int* MYCOL) { + /* + * Purpose + * ======= + * + * HPL_grid_info returns the grid shape and the coordinates in the grid + * of the calling process. Successful completion is indicated by the + * returned error code MPI_SUCCESS. Other error codes depend on the MPI + * implementation. + * + * Arguments + * ========= + * + * GRID (local input) const HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * NPROW (global output) int * + * On exit, NPROW specifies the number of process rows in the + * grid. NPROW is at least one. + * + * NPCOL (global output) int * + * On exit, NPCOL specifies the number of process columns in + * the grid. NPCOL is at least one. + * + * MYROW (global output) int * + * On exit, MYROW specifies my row process coordinate in the + * grid. MYROW is greater than or equal to zero and less than + * NPROW. + * + * MYCOL (global output) int * + * On exit, MYCOL specifies my column process coordinate in the + * grid. MYCOL is greater than or equal to zero and less than + * NPCOL. + * + * --------------------------------------------------------------------- + */ + + *NPROW = GRID->nprow; + *NPCOL = GRID->npcol; + *MYROW = GRID->myrow; + *MYCOL = GRID->mycol; + return (MPI_SUCCESS); +} diff --git a/src/grid/HPL_grid_init.cpp b/src/grid/HPL_grid_init.cpp new file mode 100644 index 0000000..ca885bf --- /dev/null +++ b/src/grid/HPL_grid_init.cpp @@ -0,0 +1,190 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_grid_init(MPI_Comm COMM, + const HPL_T_ORDER ORDER, + const int NPROW, + const int NPCOL, + const int p, + const int q, + HPL_T_grid* GRID) { + /* + * Purpose + * ======= + * + * HPL_grid_init creates a NPROW x NPCOL process grid using column- or + * row-major ordering from an initial collection of processes identified + * by an MPI communicator. Successful completion is indicated by the + * returned error code MPI_SUCCESS. Other error codes depend on the MPI + * implementation. The coordinates of processes that are not part of the + * grid are set to values outside of [0..NPROW) x [0..NPCOL). + * + * Arguments + * ========= + * + * COMM (global/local input) MPI_Comm + * On entry, COMM is the MPI communicator identifying the + * initial collection of processes out of which the grid is + * formed. + * + * ORDER (global input) const HPL_T_ORDER + * On entry, ORDER specifies how the processes should be ordered + * in the grid as follows: + * ORDER = HPL_ROW_MAJOR row-major ordering; + * ORDER = HPL_COLUMN_MAJOR column-major ordering; + * + * NPROW (global input) const int + * On entry, NPROW specifies the number of process rows in the + * grid to be created. NPROW must be at least one. + * + * NPCOL (global input) const int + * On entry, NPCOL specifies the number of process columns in + * the grid to be created. NPCOL must be at least one. + * + * GRID (local input/output) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information to be initialized. 
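+ *
+ * p (global input) const int
+ * On entry, p specifies the number of rows of the node-local
+ * process grid. (This argument is not described in the original
+ * header; the description here is inferred from the code below.)
+ *
+ * q (global input) const int
+ * On entry, q specifies the number of columns of the node-local
+ * process grid, so that p * q is the number of ranks per node
+ * (also inferred from the code below).
+ *
+ * Illustrative example: with NPROW = 4, NPCOL = 2, p = 2, q = 1 and
+ * column-major ordering, the 8 ranks are grouped in pairs per node;
+ * ranks 0..3 form process column 0 and ranks 4..7 form process
+ * column 1, each node contributing a 2 x 1 block of the grid.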
+ * + * --------------------------------------------------------------------- + */ + + int hdim, hplerr = MPI_SUCCESS, ierr, ip2, k, mask, mycol, myrow, nprocs, + rank, size; + int local_myrow, local_mycol; + + MPI_Comm_rank(COMM, &rank); + MPI_Comm_size(COMM, &size); + /* + * Abort if illegal process grid + */ + nprocs = NPROW * NPCOL; + if((nprocs > size) || (NPROW < 1) || (NPCOL < 1)) { + HPL_pabort(__LINE__, "HPL_grid_init", "Illegal Grid"); + } + /* + * Row- or column-major ordering of the processes + */ + int local_size = p * q; + int local_rank = rank % local_size; + int node = rank / local_size; // node number + + if(ORDER == HPL_ROW_MAJOR) { + GRID->order = HPL_ROW_MAJOR; + local_mycol = local_rank % q; + local_myrow = local_rank / q; + + int noderow = node / (NPCOL / q); + int nodecol = node % (NPCOL / q); + + myrow = noderow * p + local_myrow; + mycol = nodecol * q + local_mycol; + + myrow = rank / NPCOL; + mycol = rank - myrow * NPCOL; + } else { + GRID->order = HPL_COLUMN_MAJOR; + local_mycol = local_rank / p; + local_myrow = local_rank % p; + + int noderow = node % (NPROW / p); + int nodecol = node / (NPROW / p); + + myrow = noderow * p + local_myrow; + mycol = nodecol * q + local_mycol; + } + + GRID->iam = rank; + GRID->local_myrow = local_myrow; + GRID->local_mycol = local_mycol; + GRID->myrow = myrow; + GRID->mycol = mycol; + GRID->local_nprow = p; + GRID->local_npcol = q; + GRID->nprow = NPROW; + GRID->npcol = NPCOL; + GRID->nprocs = nprocs; + /* + * row_ip2 : largest power of two <= nprow; + * row_hdim : row_ip2 procs hypercube dim; + * row_ip2m1 : largest power of two <= nprow-1; + * row_mask : row_ip2m1 procs hypercube mask; + */ + hdim = 0; + ip2 = 1; + k = NPROW; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + hdim++; + } + GRID->row_ip2 = ip2; + GRID->row_hdim = hdim; + + mask = ip2 = 1; + k = NPROW - 1; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + GRID->row_ip2m1 = ip2; + GRID->row_mask = mask; + /* + * col_ip2 : largest power of two <= npcol; + * col_hdim : col_ip2 procs hypercube dim; + * col_ip2m1 : largest power of two <= npcol-1; + * col_mask : col_ip2m1 procs hypercube mask; + */ + hdim = 0; + ip2 = 1; + k = NPCOL; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + hdim++; + } + GRID->col_ip2 = ip2; + GRID->col_hdim = hdim; + + mask = ip2 = 1; + k = NPCOL - 1; + while(k > 1) { + k >>= 1; + ip2 <<= 1; + mask <<= 1; + mask++; + } + GRID->col_ip2m1 = ip2; + GRID->col_mask = mask; + /* + * All communicator, leave if I am not part of this grid. Creation of the + * row- and column communicators. + */ + ierr = MPI_Comm_split( + COMM, (rank < nprocs ? 0 : MPI_UNDEFINED), rank, &(GRID->all_comm)); + if(GRID->all_comm == MPI_COMM_NULL) return (ierr); + + ierr = MPI_Comm_split(GRID->all_comm, myrow, mycol, &(GRID->row_comm)); + if(ierr != MPI_SUCCESS) hplerr = ierr; + + ierr = MPI_Comm_split(GRID->all_comm, mycol, myrow, &(GRID->col_comm)); + if(ierr != MPI_SUCCESS) hplerr = ierr; + + return (hplerr); +} diff --git a/src/matgen/HPL_pdmatgen.cpp b/src/matgen/HPL_pdmatgen.cpp new file mode 100644 index 0000000..0f4c50f --- /dev/null +++ b/src/matgen/HPL_pdmatgen.cpp @@ -0,0 +1,262 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <unistd.h>  // sysconf
+#include <algorithm> // std::min
+#include <cassert>   // assert
+#include <omp.h>     // omp_get_thread_num, omp_get_num_threads
+
+const int max_nthreads = 128;
+
+static int Malloc(HPL_T_grid* GRID,
+                  void** ptr,
+                  const size_t bytes,
+                  int info[3]) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  unsigned long pg_size = sysconf(_SC_PAGESIZE);
+  int err = posix_memalign(ptr, pg_size, bytes);
+
+  /*Check allocation is valid*/
+  info[0] = (err != 0);
+  info[1] = myrow;
+  info[2] = mycol;
+  (void)HPL_all_reduce((void*)(info), 3, HPL_INT, HPL_MAX, GRID->all_comm);
+  if(info[0] != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int hostMalloc(HPL_T_grid* GRID,
+                      void** ptr,
+                      const size_t bytes,
+                      int info[3]) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipHostMalloc(ptr, bytes);
+
+  /*Check allocation is valid*/
+  info[0] = (err != hipSuccess);
+  info[1] = myrow;
+  info[2] = mycol;
+  (void)HPL_all_reduce((void*)(info), 3, HPL_INT, HPL_MAX, GRID->all_comm);
+  if(info[0] != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int deviceMalloc(HPL_T_grid* GRID,
+                        void** ptr,
+                        const size_t bytes,
+                        int info[3]) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipMalloc(ptr, bytes);
+
+  /*Check allocation is valid*/
+  info[0] = (err != hipSuccess);
+  info[1] = myrow;
+  info[2] = mycol;
+  (void)HPL_all_reduce((void*)(info), 3, HPL_INT, HPL_MAX, GRID->all_comm);
+  if(info[0] != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+int HPL_pdmatgen(HPL_T_test* TEST,
+                 HPL_T_grid* GRID,
+                 HPL_T_palg* ALGO,
+                 HPL_T_pmat* mat,
+                 const int N,
+                 const int NB) {
+
+  int ii, ip2, im4096;
+  int mycol, myrow, npcol, nprow, nq, info[3];
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  mat->n    = N;
+  mat->nb   = NB;
+  mat->info = 0;
+  mat->mp   = HPL_numroc(N, NB, NB, myrow, 0, nprow);
+  nq        = HPL_numroc(N, NB, NB, mycol, 0, npcol);
+  /*
+   * Allocate matrix, right-hand-side, and vector solution x. [ A | b ] is
+   * N by N+1. One column is added in every process column for the solve.
+   * The result however is stored in a 1 x N vector replicated in every
+   * process row. In every process, A is lda * (nq+1), x is 1 * nq and the
+   * workspace is mp.
+   */
+  mat->ld = Mmax(1, mat->mp);
+  mat->ld = ((mat->ld + 95) / 128) * 128 + 32; /*pad*/
+
+  mat->nq = nq + 1;
+
+  mat->dA = nullptr;
+  mat->dX = nullptr;
+
+  mat->dW = nullptr;
+  mat->W  = nullptr;
+
+  /* Create a rocBLAS handle */
+  rocblas_create_handle(&handle);
+  rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host);
+  rocblas_initialize();
+  rocblas_set_stream(handle, computeStream);
+
+  /*
+   * Allocate dynamic memory
+   */
+
+  // allocate on device
+  size_t numbytes = ((size_t)(mat->ld) * (size_t)(mat->nq)) * sizeof(double);
+
+#ifdef HPL_VERBOSE_PRINT
+  if((myrow == 0) && (mycol == 0)) {
+    printf("Local matrix size = %g GBs\n",
+           ((double)numbytes) / (1024 * 1024 * 1024));
+  }
+#endif
+
+  if(deviceMalloc(GRID, (void**)&(mat->dA), numbytes, info) != HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Device memory allocation failed for A and b. Skip.");
+    return HPL_FAILURE;
+  }
+
+  // separate space for X vector
+  if(deviceMalloc(GRID, (void**)&(mat->dX), mat->nq * sizeof(double), info) !=
+     HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Device memory allocation failed for x. Skip.");
+    return HPL_FAILURE;
+  }
+
+  int Anp;
+  Mnumroc(Anp, mat->n, mat->nb, mat->nb, myrow, 0, nprow);
+
+  /*Need space for a column of panels for pdfact on CPU*/
+  size_t A_hostsize = mat->ld * mat->nb * sizeof(double);
+
+  if(hostMalloc(GRID, (void**)&(mat->A), A_hostsize, info) != HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Panel memory allocation failed. Skip.");
+    return HPL_FAILURE;
+  }
+
+#pragma omp parallel
+  {
+    /*First touch*/
+    const int thread_rank = omp_get_thread_num();
+    const int thread_size = omp_get_num_threads();
+    assert(thread_size <= max_nthreads);
+
+    for(int i = 0; i < mat->ld; i += NB) {
+      if((i / NB) % thread_size == thread_rank) {
+        const int mm = std::min(NB, mat->ld - i);
+        for(int k = 0; k < NB; ++k) {
+          for(int j = 0; j < mm; ++j) {
+            mat->A[j + i + static_cast<size_t>(mat->ld) * k] = 0.0;
+          }
+        }
+      }
+    }
+  }
+
+  size_t dworkspace_size = 0;
+  size_t workspace_size  = 0;
+
+  /*pdtrsv needs two vectors for B and W (and X on host) */
+  dworkspace_size = Mmax(2 * Anp * sizeof(double), dworkspace_size);
+  workspace_size  = Mmax((2 * Anp + nq) * sizeof(double), workspace_size);
+
+  /*Scratch space for rows in pdlaswp (with extra space for padding) */
+  dworkspace_size =
+      Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), dworkspace_size);
+  workspace_size =
+      Mmax((nq + mat->nb + 256) * mat->nb * sizeof(double), workspace_size);
+
+  if(deviceMalloc(GRID, (void**)&(mat->dW), dworkspace_size, info) !=
+     HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Device memory allocation failed for U workspace. Skip.");
+    return HPL_FAILURE;
+  }
+  if(hostMalloc(GRID, (void**)&(mat->W), workspace_size, info) != HPL_SUCCESS) {
+    HPL_pwarn(TEST->outfp,
+              __LINE__,
+              "HPL_pdmatgen",
+              "[%d,%d] %s",
+              info[1],
+              info[2],
+              "Host memory allocation failed for U workspace. Skip.");
+    return HPL_FAILURE;
+  }
+
+  return HPL_SUCCESS;
+}
+
+void HPL_pdmatfree(HPL_T_pmat* mat) {
+
+  if(mat->dA) {
+    hipFree(mat->dA);
+    mat->dA = nullptr;
+  }
+  if(mat->dX) {
+    hipFree(mat->dX);
+    mat->dX = nullptr;
+  }
+  if(mat->dW) {
+    hipFree(mat->dW);
+    mat->dW = nullptr;
+  }
+
+  if(mat->A) {
+    hipHostFree(mat->A);
+    mat->A = nullptr;
+  }
+  if(mat->W) {
+    hipHostFree(mat->W);
+    mat->W = nullptr;
+  }
+
+  rocblas_destroy_handle(handle);
+}
diff --git a/src/matgen/HPL_pdrandmat_device.cpp b/src/matgen/HPL_pdrandmat_device.cpp
new file mode 100644
index 0000000..66b16c7
--- /dev/null
+++ b/src/matgen/HPL_pdrandmat_device.cpp
@@ -0,0 +1,201 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +#define BLOCK_SIZE 512 + +__global__ void hpl_randmat(const int mp, + const int nq, + const int NB, + const int LDA, + const uint64_t cblkjumpA, + const uint64_t cblkjumpC, + const uint64_t rblkjumpA, + const uint64_t rblkjumpC, + const uint64_t cjumpA, + const uint64_t cjumpC, + const uint64_t rjumpA, + const uint64_t rjumpC, + const uint64_t startrand, + double* __restrict__ A) { + + const int jblk = blockIdx.y; + const int iblk = blockIdx.x; + + /* Get panel size */ + const int jb = (jblk == gridDim.y - 1) ? nq - ((nq - 1) / NB) * NB : NB; + const int ib = (iblk == gridDim.x - 1) ? mp - ((mp - 1) / NB) * NB : NB; + + double* Ab = A + iblk * NB + static_cast(jblk * NB) * LDA; + + /* Start at first uint64_t */ + uint64_t irand = startrand; + + /* Jump rand M*NB*npcol for each jblk */ + for(int j = 0; j < jblk; ++j) { irand = cblkjumpA * irand + cblkjumpC; } + + /* Jump rand NB*nprow for each iblk */ + for(int i = 0; i < iblk; ++i) { irand = rblkjumpA * irand + rblkjumpC; } + + /* Shift per-column irand */ + const int n = threadIdx.x; + for(int j = 0; j < threadIdx.x; ++j) { irand = cjumpA * irand + cjumpC; } + + for(int n = threadIdx.x; n < jb; n += blockDim.x) { + /*Grab rand at top of block*/ + uint64_t r = irand; + + /* Each thread traverses a column */ + for(int m = 0; m < ib; ++m) { + /*Generate a random double from the current r */ + const double p1 = ((r & (65535LU << 0)) >> 0); + const double p2 = ((r & (65535LU << 16)) >> 16); + const double p3 = ((r & (65535LU << 32)) >> 32); + const double p4 = ((r & (65535LU << 48)) >> 48); + + Ab[m + n * LDA] = + (HPL_HALF - (((p1) + (p2)*HPL_POW16) / HPL_DIVFAC * HPL_HALF + (p3) + + (p4)*HPL_POW16) / + HPL_DIVFAC * HPL_HALF); + + /*Increment rand*/ + r = rjumpA * r + rjumpC; + } + + /* Block-shift per-column irand */ + for(int j = 0; j < blockDim.x; ++j) { irand = cjumpA * irand + cjumpC; } + } +} + +void HPL_pdrandmat(const HPL_T_grid* GRID, + const int M, + const int N, + const int NB, + double* A, + const int LDA, + const int ISEED) { + /* + * Purpose + * ======= + * + * HPL_pdrandmat generates (or regenerates) a parallel random matrix A. + * + * The pseudo-random generator uses the linear congruential algorithm: + * X(n+1) = (a * X(n) + c) mod m as described in the Art of Computer + * Programming, Knuth 1973, Vol. 2. + * + * Arguments + * ========= + * + * GRID (local input) const HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * M (global input) const int + * On entry, M specifies the number of rows of the matrix A. + * M must be at least zero. + * + * N (global input) const int + * On entry, N specifies the number of columns of the matrix A. + * N must be at least zero. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * A (local output) double * + * On entry, A points to an array of dimension (LDA,LocQ(N)). + * On exit, this array contains the coefficients of the randomly + * generated matrix. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,LocP(M)). + * + * ISEED (global input) const int + * On entry, ISEED specifies the seed number to generate the + * matrix A. ISEED must be at least zero. 
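+ *
+ * Note: a jump of k entries in the sequence composes to
+ * X(n+k) = A_k * X(n) + C_k (mod 2^64), with A_k = a^k and
+ * C_k = (a^(k-1) + ... + a + 1) * c. The coefficients used by the
+ * kernel are computed below via HPL_xjumpm: one column step is a
+ * jump of M entries, one local row block a jump of nprow*NB entries,
+ * and one local column block a jump of M*NB*npcol entries.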
+ *
+ * ---------------------------------------------------------------------
+ */
+  int mp, mycol, myrow, npcol, nprow, nq;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  uint64_t mult64  = HPL_MULT;
+  uint64_t iadd64  = HPL_IADD;
+  uint64_t jseed64 = static_cast<uint64_t>(ISEED);
+
+  /*
+   * Generate an M by N matrix starting in process (0,0)
+   */
+  Mnumroc(mp, M, NB, NB, myrow, 0, nprow);
+  Mnumroc(nq, N, NB, NB, mycol, 0, npcol);
+
+  if((mp <= 0) || (nq <= 0)) return;
+
+  /*
+   * Compute multiplier/adder for various jumps in random sequence
+   */
+  const int jump1 = 1;
+  const int jump2 = nprow * NB;
+  const int jump3 = M;
+  const int jump4 = npcol * NB;
+  const int jump5 = NB;
+  const int jump6 = mycol;
+  const int jump7 = myrow * NB;
+
+  uint64_t startrand;
+  uint64_t rjumpA, rblkjumpA, cjumpA, cblkjumpA, ia564;
+  uint64_t rjumpC, rblkjumpC, cjumpC, cblkjumpC, ic564;
+  uint64_t itmp164, itmp264, itmp364;
+
+  /* Compute different jump coefficients */
+  HPL_xjumpm(jump1, mult64, iadd64, jseed64, startrand, rjumpA, rjumpC);
+  HPL_xjumpm(jump2, mult64, iadd64, startrand, itmp164, rblkjumpA, rblkjumpC);
+  HPL_xjumpm(jump3, mult64, iadd64, startrand, itmp164, cjumpA, cjumpC);
+  HPL_xjumpm(jump4, cjumpA, cjumpC, startrand, itmp164, cblkjumpA, cblkjumpC);
+
+  /* Shift the starting random value for this rank */
+  HPL_xjumpm(jump5, cjumpA, cjumpC, startrand, itmp164, ia564, ic564);
+  HPL_xjumpm(jump6, ia564, ic564, startrand, itmp364, itmp164, itmp264);
+  HPL_xjumpm(jump7, mult64, iadd64, itmp364, startrand, itmp164, itmp264);
+
+  /*
+   * Local number of blocks
+   */
+  const int mblks = (mp + NB - 1) / NB;
+  const int nblks = (nq + NB - 1) / NB;
+
+  /* Initialize on GPU */
+  dim3 grid = dim3(mblks, nblks);
+  /* (launch configuration reconstructed: one block per NB x NB tile of
+   * BLOCK_SIZE threads; computeStream assumed, matching the stream set
+   * on the rocBLAS handle in HPL_pdmatgen) */
+  hpl_randmat<<<grid, dim3(BLOCK_SIZE), 0, computeStream>>>(mp,
+                                                            nq,
+                                                            NB,
+                                                            LDA,
+                                                            cblkjumpA,
+                                                            cblkjumpC,
+                                                            rblkjumpA,
+                                                            rblkjumpC,
+                                                            cjumpA,
+                                                            cjumpC,
+                                                            rjumpA,
+                                                            rjumpC,
+                                                            startrand,
+                                                            A);
+
+  hipDeviceSynchronize();
+}
diff --git a/src/matgen/HPL_xjumpm.cpp b/src/matgen/HPL_xjumpm.cpp
new file mode 100644
index 0000000..f1cd377
--- /dev/null
+++ b/src/matgen/HPL_xjumpm.cpp
@@ -0,0 +1,92 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_xjumpm(const int JUMPM,
+                const uint64_t MULT,
+                const uint64_t IADD,
+                const uint64_t IRANN,
+                uint64_t& IRANM,
+                uint64_t& IAM,
+                uint64_t& ICM) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_xjumpm computes the constants A and C to jump JUMPM numbers in
+   * the random sequence: X(n+JUMPM) = A*X(n)+C. The constants encoded in
+   * MULT and IADD specify how to jump from one entry in the sequence to
+   * the next.
+   *
+   * Arguments
+   * =========
+   *
+   * JUMPM (local input) const int
+   * On entry, JUMPM specifies the number of entries in the
+   * sequence to jump over. When JUMPM is less than or equal to
+   * zero, A and C are not computed, and IRANM is set to IRANN,
+   * corresponding to a jump of size zero.
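+ * (Note: for JUMPM > 0 the loop below accumulates
+ * IAM = MULT^JUMPM and ICM = (MULT^(JUMPM-1) + ... + MULT + 1) * IADD,
+ * both modulo 2^64 via the natural uint64_t wrap-around, so that
+ * X(n+JUMPM) = IAM * X(n) + ICM.)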
+ *
+ * MULT (local input) uint64_t
+ * On entry, MULT is the 64-bit multiplier a used to jump from
+ * X(n) to X(n+1) = a*X(n) + c in the random sequence.
+ *
+ * IADD (local input) uint64_t
+ * On entry, IADD is the 64-bit addend c used to jump from
+ * X(n) to X(n+1) = a*X(n) + c in the random sequence.
+ *
+ * IRANN (local input) uint64_t
+ * On entry, IRANN is the 64-bit encoding of X(n).
+ *
+ * IRANM (local output) uint64_t
+ * On exit, IRANM contains the 64-bit encoding of X(n+JUMPM).
+ *
+ * IAM (local output) uint64_t
+ * On exit, when JUMPM is greater than zero, IAM contains the
+ * 64-bit constant A to jump from X(n) to X(n+JUMPM) in the
+ * random sequence. When JUMPM is less than or equal to zero,
+ * IAM is not referenced.
+ *
+ * ICM (local output) uint64_t
+ * On exit, when JUMPM is greater than zero, ICM contains the
+ * 64-bit constant C to jump from X(n) to X(n+JUMPM) in the
+ * random sequence. When JUMPM is less than or equal to zero,
+ * ICM is not referenced.
+ *
+ * ---------------------------------------------------------------------
+ */
+  if(JUMPM > 0) {
+    IAM = MULT;
+    ICM = IADD;
+    for(int k = 1; k <= JUMPM - 1; k++) {
+      IAM *= MULT;
+      ICM = ICM * MULT + IADD;
+    }
+    IRANM = IRANN * IAM + ICM;
+  } else {
+    IRANM = IRANN;
+  }
+}
diff --git a/src/panel/HPL_pdpanel_SendToDevice.cpp b/src/panel/HPL_pdpanel_SendToDevice.cpp
new file mode 100644
index 0000000..30f7e12
--- /dev/null
+++ b/src/panel/HPL_pdpanel_SendToDevice.cpp
@@ -0,0 +1,216 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_unroll_ipiv(const int mp, + const int jb, + int* ipiv, + int* ipiv_ex, + int* upiv) { + + for(int i = 0; i < mp; i++) { upiv[i] = i; } // initialize ids + for(int i = 0; i < jb; i++) { // swap ids + int id = upiv[i]; + upiv[i] = upiv[ipiv[i]]; + upiv[ipiv[i]] = id; + } + + for(int i = 0; i < jb; i++) { ipiv_ex[i] = -1; } + + int cnt = 0; + for(int i = jb; i < mp; i++) { // find swapped ids outside of panel + if(upiv[i] < jb) { ipiv_ex[upiv[i]] = i; } + } +} + +void HPL_pdpanel_SendToDevice(HPL_T_panel* PANEL) { + double *A, *dA; + int jb, i, ml2; + + jb = PANEL->jb; + + if(jb <= 0) return; + + // only the root column copies to device + if(PANEL->grid->mycol == PANEL->pcol) { + + if(PANEL->grid->nprow == 1) { + + // unroll pivoting and send to device now + int* ipiv = PANEL->ipiv; + int* ipiv_ex = PANEL->ipiv + jb; + int* upiv = PANEL->IWORK + jb; // scratch space + + for(i = 0; i < jb; i++) { ipiv[i] -= PANEL->ii; } // shift + HPL_unroll_ipiv(PANEL->mp, jb, ipiv, ipiv_ex, upiv); + + int* dipiv = PANEL->dipiv; + int* dipiv_ex = PANEL->dipiv + jb; + + hipMemcpy2DAsync(dipiv, + jb * sizeof(int), + upiv, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + hipMemcpy2DAsync(dipiv_ex, + jb * sizeof(int), + ipiv_ex, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + + } else { + + int k = (int)((unsigned int)(jb) << 1); + int* iflag = PANEL->IWORK; + int* ipl = iflag + 1; + int* ipID = ipl + 1; + int* ipA = ipID + ((unsigned int)(k) << 1); + int* iplen = ipA + 1; + int* ipmap = iplen + PANEL->grid->nprow + 1; + int* ipmapm1 = ipmap + PANEL->grid->nprow; + int* upiv = ipmapm1 + PANEL->grid->nprow; + int* iwork = upiv + PANEL->mp; + + int* lindxU = PANEL->lindxU; + int* lindxA = PANEL->lindxA; + int* lindxAU = PANEL->lindxAU; + int* permU = PANEL->permU; + int* permU_ex = permU + jb; + int* ipiv = PANEL->ipiv; + + int* dlindxU = PANEL->dlindxU; + int* dlindxA = PANEL->dlindxA; + int* dlindxAU = PANEL->dlindxAU; + int* dpermU = PANEL->dpermU; + int* dpermU_ex = dpermU + jb; + int* dipiv = PANEL->dipiv; + + if(*iflag == -1) /* no index arrays have been computed so far */ + { + HPL_pipid(PANEL, ipl, ipID); + HPL_plindx(PANEL, + *ipl, + ipID, + ipA, + lindxU, + lindxAU, + lindxA, + iplen, + permU, + iwork); + *iflag = 1; + } + + int N = Mmax(*ipA, jb); + if(N > 0) { + hipMemcpy2DAsync(dlindxA, + k * sizeof(int), + lindxA, + k * sizeof(int), + N * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + hipMemcpy2DAsync(dlindxAU, + k * sizeof(int), + lindxAU, + k * sizeof(int), + N * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + } + + hipMemcpyAsync( + dlindxU, lindxU, jb * sizeof(int), hipMemcpyHostToDevice, dataStream); + + hipMemcpy2DAsync(dpermU, + jb * sizeof(int), + permU, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + + // send the ipivs along with L2 in the Bcast + hipMemcpy2DAsync(dipiv, + jb * sizeof(int), + ipiv, + jb * sizeof(int), + jb * sizeof(int), + 1, + hipMemcpyHostToDevice, + dataStream); + } + } + + // copy A and/or L2 + if(PANEL->grid->mycol == PANEL->pcol) { + // copy L1 + hipMemcpy2DAsync(PANEL->dL1, + jb * sizeof(double), + PANEL->L1, + jb * sizeof(double), + jb * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + + if(PANEL->grid->npcol > 1) { // L2 is its own array + 
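+      // When npcol > 1, L2 lives in its own workspace: the root row
+      // keeps the top jb x jb block of the panel in dA and copies only
+      // the mp-jb rows below it into dL2, while every other process
+      // row in this column copies all mp of its local panel rows into
+      // dL2 (see the two branches below).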
if(PANEL->grid->myrow == PANEL->prow) { + hipMemcpy2DAsync(Mptr(PANEL->dA, 0, -jb, PANEL->dlda), + PANEL->dlda * sizeof(double), + Mptr(PANEL->A, 0, 0, PANEL->lda), + PANEL->lda * sizeof(double), + jb * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + + if((PANEL->mp - jb) > 0) + hipMemcpy2DAsync(PANEL->dL2, + PANEL->dldl2 * sizeof(double), + Mptr(PANEL->A, jb, 0, PANEL->lda), + PANEL->lda * sizeof(double), + (PANEL->mp - jb) * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + } else { + if((PANEL->mp) > 0) + hipMemcpy2DAsync(PANEL->dL2, + PANEL->dldl2 * sizeof(double), + Mptr(PANEL->A, 0, 0, PANEL->lda), + PANEL->lda * sizeof(double), + PANEL->mp * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + } + } else { + if(PANEL->mp > 0) + hipMemcpy2DAsync(Mptr(PANEL->dA, 0, -jb, PANEL->dlda), + PANEL->dlda * sizeof(double), + Mptr(PANEL->A, 0, 0, PANEL->lda), + PANEL->lda * sizeof(double), + PANEL->mp * sizeof(double), + jb, + hipMemcpyHostToDevice, + dataStream); + } + } +} diff --git a/src/panel/HPL_pdpanel_SendToHost.cpp b/src/panel/HPL_pdpanel_SendToHost.cpp new file mode 100644 index 0000000..e8a496f --- /dev/null +++ b/src/panel/HPL_pdpanel_SendToHost.cpp @@ -0,0 +1,28 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_pdpanel_SendToHost(HPL_T_panel* PANEL) { + int jb; + + jb = PANEL->jb; + + if((PANEL->grid->mycol != PANEL->pcol) || (jb <= 0)) return; + + if(PANEL->mp > 0) + hipMemcpy2DAsync(PANEL->A, + PANEL->lda * sizeof(double), + PANEL->dA, + PANEL->dlda * sizeof(double), + PANEL->mp * sizeof(double), + jb, + hipMemcpyDeviceToHost, + dataStream); +} diff --git a/src/panel/HPL_pdpanel_bcast.cpp b/src/panel/HPL_pdpanel_bcast.cpp new file mode 100644 index 0000000..32b0bf3 --- /dev/null +++ b/src/panel/HPL_pdpanel_bcast.cpp @@ -0,0 +1,56 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_pdpanel_bcast(HPL_T_panel* PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdpanel_bcast broadcasts the current panel. Successful completion + * is indicated by a return code of HPL_SUCCESS. + * + * Arguments + * ========= + * + * PANEL (input/output) HPL_T_panel * + * On entry, PANEL points to the current panel data structure + * being broadcast. 
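+ *
+ * Note: the broadcast is a single HPL_bcast of PANEL->dL2 of length
+ * PANEL->len. That buffer is laid out by HPL_pdpanel_init as
+ * [ L2 | L1 | pivot integer arrays ], i.e. len = ml2*jb + jb*jb + lpiv,
+ * so one message carries everything the other process columns need
+ * for the update.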
+ * + * --------------------------------------------------------------------- + */ + + if(PANEL == NULL) { return HPL_SUCCESS; } + if(PANEL->grid->npcol <= 1) { return HPL_SUCCESS; } + + MPI_Comm comm = PANEL->grid->row_comm; + int root = PANEL->pcol; + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_LBCAST); +#endif + /* + * Single Bcast call + */ + int err = HPL_bcast(PANEL->dL2, PANEL->len, root, comm, PANEL->algo->btopo); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_LBCAST); +#endif + + return err; +} diff --git a/src/panel/HPL_pdpanel_disp.cpp b/src/panel/HPL_pdpanel_disp.cpp new file mode 100644 index 0000000..cd589e0 --- /dev/null +++ b/src/panel/HPL_pdpanel_disp.cpp @@ -0,0 +1,48 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_pdpanel_disp(HPL_T_panel** PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdpanel_disp deallocates the panel structure and resources and + * stores the error code returned by the panel factorization. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * * + * On entry, PANEL points to the address of the panel data + * structure to be deallocated. + * + * --------------------------------------------------------------------- + */ + + int mpierr; + + /* + * Deallocate the panel resources and panel structure + */ + (*PANEL)->free_work_now = 1; + mpierr = HPL_pdpanel_free(*PANEL); + if(*PANEL) free(*PANEL); + *PANEL = NULL; + + return (mpierr); +} diff --git a/src/panel/HPL_pdpanel_free.cpp b/src/panel/HPL_pdpanel_free.cpp new file mode 100644 index 0000000..dcdc9b8 --- /dev/null +++ b/src/panel/HPL_pdpanel_free.cpp @@ -0,0 +1,56 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +int HPL_pdpanel_free(HPL_T_panel* PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdpanel_free deallocates the panel resources and stores the error + * code returned by the panel factorization. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the panel data structure from + * which the resources should be deallocated. 
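+ *
+ * Note: unless PANEL->free_work_now is set to 1 (as HPL_pdpanel_disp
+ * does before calling this routine), the L, U, integer and pdfact
+ * workspaces are kept allocated so that they can be reused by the
+ * next panel; the max_*_size bookkeeping only grows them when a
+ * later panel needs more room.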
+ *
+ * ---------------------------------------------------------------------
+ */
+
+  if(PANEL->pmat->info == 0) PANEL->pmat->info = *(PANEL->DINFO);
+
+  if(PANEL->free_work_now == 1) {
+
+    if(PANEL->dLWORK) hipFree(PANEL->dLWORK);
+    if(PANEL->dUWORK) hipFree(PANEL->dUWORK);
+    if(PANEL->LWORK) hipHostFree(PANEL->LWORK);
+    if(PANEL->UWORK) hipHostFree(PANEL->UWORK);
+
+    PANEL->max_lwork_size = 0;
+    PANEL->max_uwork_size = 0;
+
+    if(PANEL->IWORK) free(PANEL->IWORK);
+    if(PANEL->fWORK) free(PANEL->fWORK);
+
+    PANEL->max_iwork_size = 0;
+    PANEL->max_fwork_size = 0;
+  }
+
+  return (HPL_SUCCESS);
+}
diff --git a/src/panel/HPL_pdpanel_init.cpp b/src/panel/HPL_pdpanel_init.cpp
new file mode 100644
index 0000000..584f7f4
--- /dev/null
+++ b/src/panel/HPL_pdpanel_init.cpp
@@ -0,0 +1,475 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <unistd.h> // sysconf
+
+static int Malloc(HPL_T_grid* GRID, void** ptr, const size_t bytes) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  unsigned long pg_size = sysconf(_SC_PAGESIZE);
+  int err = posix_memalign(ptr, pg_size, bytes);
+
+  /*Check workspace allocation is valid*/
+  if(err != 0) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int hostMalloc(HPL_T_grid* GRID, void** ptr, const size_t bytes) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipHostMalloc(ptr, bytes);
+
+  /*Check workspace allocation is valid*/
+  if(err != hipSuccess) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+static int deviceMalloc(HPL_T_grid* GRID, void** ptr, const size_t bytes) {
+
+  int mycol, myrow, npcol, nprow;
+  (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol);
+
+  hipError_t err = hipMalloc(ptr, bytes);
+
+  /*Check workspace allocation is valid*/
+  if(err != hipSuccess) {
+    return HPL_FAILURE;
+  } else {
+    return HPL_SUCCESS;
+  }
+}
+
+void HPL_pdpanel_init(HPL_T_grid* GRID,
+                      HPL_T_palg* ALGO,
+                      const int M,
+                      const int N,
+                      const int JB,
+                      HPL_T_pmat* A,
+                      const int IA,
+                      const int JA,
+                      const int TAG,
+                      HPL_T_panel* PANEL) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdpanel_init initializes a panel data structure.
+   *
+   * Arguments
+   * =========
+   *
+   * GRID (local input) HPL_T_grid *
+   * On entry, GRID points to the data structure containing the
+   * process grid information.
+   *
+   * ALGO (global input) HPL_T_palg *
+   * On entry, ALGO points to the data structure containing the
+   * algorithmic parameters.
+   *
+   * M (local input) const int
+   * On entry, M specifies the global number of rows of the panel.
+   * M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the global number of columns of the
+   * panel and trailing submatrix. N must be at least zero.
+   *
+   * JB (global input) const int
+   * On entry, JB specifies the number of columns of the panel.
+   * JB must be at least zero.
+ * + * A (local input/output) HPL_T_pmat * + * On entry, A points to the data structure containing the local + * array information. + * + * IA (global input) const int + * On entry, IA is the global row index identifying the panel + * and trailing submatrix. IA must be at least zero. + * + * JA (global input) const int + * On entry, JA is the global column index identifying the panel + * and trailing submatrix. JA must be at least zero. + * + * TAG (global input) const int + * On entry, TAG is the row broadcast message id. + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * --------------------------------------------------------------------- + */ + + size_t dalign; + int icurcol, icurrow, ii, itmp1, jj, lwork, uwork, ml2, mp, mycol, myrow, nb, + npcol, nprow, nq, nu, ldu; + + PANEL->grid = GRID; /* ptr to the process grid */ + PANEL->algo = ALGO; /* ptr to the algo parameters */ + PANEL->pmat = A; /* ptr to the local array info */ + + myrow = GRID->myrow; + mycol = GRID->mycol; + nprow = GRID->nprow; + npcol = GRID->npcol; + nb = A->nb; + + HPL_infog2l(IA, + JA, + nb, + nb, + nb, + nb, + 0, + 0, + myrow, + mycol, + nprow, + npcol, + &ii, + &jj, + &icurrow, + &icurcol); + mp = HPL_numrocI(M, IA, nb, nb, myrow, 0, nprow); + nq = HPL_numrocI(N, JA, nb, nb, mycol, 0, npcol); + + const int inxtcol = MModAdd1(icurcol, npcol); + const int inxtrow = MModAdd1(icurrow, nprow); + + /* ptr to trailing part of A */ + PANEL->A = A->A; + PANEL->dA = Mptr((double*)(A->dA), ii, jj, A->ld); + + /* + * Workspace pointers are initialized to NULL. + */ + PANEL->L2 = nullptr; + PANEL->dL2 = nullptr; + PANEL->L1 = nullptr; + PANEL->dL1 = nullptr; + PANEL->DINFO = nullptr; + PANEL->U = nullptr; + PANEL->dU = nullptr; + PANEL->W = nullptr; + PANEL->dW = nullptr; + PANEL->U1 = nullptr; + PANEL->dU1 = nullptr; + PANEL->W1 = nullptr; + PANEL->dW1 = nullptr; + PANEL->U2 = nullptr; + PANEL->dU2 = nullptr; + PANEL->W2 = nullptr; + PANEL->dW2 = nullptr; + // PANEL->WORK = NULL; + // PANEL->IWORK = NULL; + /* + * Local lengths, indexes process coordinates + */ + PANEL->nb = nb; /* distribution blocking factor */ + PANEL->jb = JB; /* panel width */ + PANEL->m = M; /* global # of rows of trailing part of A */ + PANEL->n = N; /* global # of cols of trailing part of A */ + PANEL->ia = IA; /* global row index of trailing part of A */ + PANEL->ja = JA; /* global col index of trailing part of A */ + PANEL->mp = mp; /* local # of rows of trailing part of A */ + PANEL->nq = nq; /* local # of cols of trailing part of A */ + PANEL->ii = ii; /* local row index of trailing part of A */ + PANEL->jj = jj; /* local col index of trailing part of A */ + PANEL->lda = A->ld; /* local leading dim of array A */ + PANEL->dlda = A->ld; /* local leading dim of array A */ + PANEL->prow = icurrow; /* proc row owning 1st row of trailing A */ + PANEL->pcol = icurcol; /* proc col owning 1st col of trailing A */ + PANEL->msgid = TAG; /* message id to be used for panel bcast */ + /* + * Initialize ldl2 and len to temporary dummy values and Update tag for + * next panel + */ + PANEL->ldl2 = 0; /* local leading dim of array L2 */ + PANEL->dldl2 = 0; /* local leading dim of array L2 */ + PANEL->len = 0; /* length of the buffer to broadcast */ + PANEL->nu0 = 0; + PANEL->nu1 = 0; + PANEL->nu2 = 0; + PANEL->ldu0 = 0; + PANEL->ldu1 = 0; + PANEL->ldu2 = 0; + + /* + * Figure out the exact amount of workspace needed by the factorization + * and the update - Allocate that 
space - Finish the panel data structure initialization.
+ *
+ * L1:    JB x JB in all processes
+ * DINFO: 1 in all processes
+ *
+ * We also make an array of necessary integers for swaps in the update.
+ *
+ * If nprow is 1, we just allocate an array of 2*JB integers for the swap.
+ * When nprow > 1, we allocate the space for the index arrays immediate-
+ * ly. The exact size of this array depends on the swapping routine that
+ * will be used, so we allocate the maximum:
+ *
+ * lindxU   is of size         JB +
+ * lindxA   is of size at most JB +
+ * lindxAU  is of size at most JB +
+ * permU    is of size at most JB +
+ *
+ * ipiv     is of size at most JB
+ *
+ * that is 5*JB.
+ *
+ * We make sure that those three arrays are contiguous in memory for the
+ * later panel broadcast (using type punning to put the integer array at
+ * the end). We also choose to put this amount of space right after
+ * L2 (when it exists) so that one can receive a contiguous buffer.
+ */
+
+  /*Split fraction*/
+  const double fraction = ALGO->frac;
+
+  dalign      = ALGO->align * sizeof(double);
+  size_t lpiv = (5 * JB * sizeof(int) + sizeof(double) - 1) / (sizeof(double));
+
+  if(npcol > 1) {
+    ml2 = (myrow == icurrow ? mp - JB : mp);
+    ml2 = Mmax(0, ml2);
+    ml2 = ((ml2 + 95) / 128) * 128 + 32; /*pad*/
+  } else {
+    ml2 = 0; // L2 is aliased inside A
+  }
+
+  /* Size of LBcast message */
+  PANEL->len = ml2 * JB + JB * JB + lpiv; // L2, L1, integer arrays
+
+  /* space for L */
+  lwork = PANEL->len + 1;
+
+  nu  = Mmax(0, (mycol == icurcol ? nq - JB : nq));
+  ldu = nu + JB + 256; /*extra space for potential padding*/
+
+  /* space for U */
+  uwork = JB * ldu;
+
+  if(PANEL->max_lwork_size < (size_t)(lwork) * sizeof(double)) {
+    if(PANEL->LWORK) {
+      hipFree(PANEL->dLWORK);
+      free(PANEL->LWORK);
+    }
+    // size_t numbytes = (((size_t)((size_t)(lwork) * sizeof( double )) +
+    // (size_t)4095)/(size_t)4096)*(size_t)4096;
+    size_t numbytes = (size_t)(lwork) * sizeof(double);
+
+    if(deviceMalloc(GRID, (void**)&(PANEL->dLWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Device memory allocation failed for L workspace.");
+    }
+    if(hostMalloc(GRID, (void**)&(PANEL->LWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for L workspace.");
+    }
+
+    PANEL->max_lwork_size = (size_t)(lwork) * sizeof(double);
+  }
+  if(PANEL->max_uwork_size < (size_t)(uwork) * sizeof(double)) {
+    if(PANEL->UWORK) {
+      hipFree(PANEL->dUWORK);
+      free(PANEL->UWORK);
+    }
+    // size_t numbytes = (((size_t)((size_t)(uwork) * sizeof( double )) +
+    // (size_t)4095)/(size_t)4096)*(size_t)4096;
+    size_t numbytes = (size_t)(uwork) * sizeof(double);
+
+    if(deviceMalloc(GRID, (void**)&(PANEL->dUWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Device memory allocation failed for U workspace.");
+    }
+    if(hostMalloc(GRID, (void**)&(PANEL->UWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for U workspace.");
+    }
+
+    PANEL->max_uwork_size = (size_t)(uwork) * sizeof(double);
+  }
+
+  /*
+   * Initialize the pointers of the panel structure
+   */
+  if(npcol == 1) {
+    PANEL->L2  = PANEL->A + (myrow == icurrow ? JB : 0);
+    PANEL->dL2 = PANEL->dA + (myrow == icurrow ?
JB : 0); + PANEL->ldl2 = A->ld; + PANEL->dldl2 = A->ld; /*L2 is aliased inside A*/ + + PANEL->L1 = (double*)PANEL->LWORK; + PANEL->dL1 = (double*)PANEL->dLWORK; + } else { + PANEL->L2 = (double*)PANEL->LWORK; + PANEL->dL2 = (double*)PANEL->dLWORK; + PANEL->ldl2 = Mmax(0, ml2); + PANEL->dldl2 = Mmax(0, ml2); + + PANEL->L1 = PANEL->L2 + ml2 * JB; + PANEL->dL1 = PANEL->dL2 + ml2 * JB; + } + + PANEL->U = (double*)PANEL->UWORK; + PANEL->dU = (double*)PANEL->dUWORK; + PANEL->W = A->W; + PANEL->dW = A->dW; + + if(nprow == 1) { + PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; + PANEL->ldu0 = PANEL->nu0; + + PANEL->nu1 = 0; + PANEL->ldu1 = 0; + + PANEL->nu2 = nu - PANEL->nu0; + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ + + PANEL->U2 = PANEL->U + JB * JB; + PANEL->dU2 = PANEL->dU + JB * JB; + PANEL->U1 = PANEL->U2 + PANEL->ldu2 * JB; + PANEL->dU1 = PANEL->dU2 + PANEL->ldu2 * JB; + + PANEL->permU = (int*)(PANEL->L1 + JB * JB); + PANEL->dpermU = (int*)(PANEL->dL1 + JB * JB); + PANEL->ipiv = PANEL->permU + JB; + PANEL->dipiv = PANEL->dpermU + JB; + + PANEL->DINFO = (double*)(PANEL->ipiv + 2 * JB); + PANEL->dDINFO = (double*)(PANEL->dipiv + 2 * JB); + } else { + const int NSplit = Mmax(0, ((((int)(A->nq * fraction)) / nb) * nb)); + PANEL->nu0 = (mycol == inxtcol) ? Mmin(JB, nu) : 0; + PANEL->ldu0 = PANEL->nu0; + + PANEL->nu2 = Mmin(nu - PANEL->nu0, NSplit); + PANEL->ldu2 = ((PANEL->nu2 + 95) / 128) * 128 + 32; /*pad*/ + + PANEL->nu1 = nu - PANEL->nu0 - PANEL->nu2; + PANEL->ldu1 = ((PANEL->nu1 + 95) / 128) * 128 + 32; /*pad*/ + + PANEL->U2 = PANEL->U + JB * JB; + PANEL->dU2 = PANEL->dU + JB * JB; + PANEL->U1 = PANEL->U2 + PANEL->ldu2 * JB; + PANEL->dU1 = PANEL->dU2 + PANEL->ldu2 * JB; + + PANEL->W2 = PANEL->W + JB * JB; + PANEL->dW2 = PANEL->dW + JB * JB; + PANEL->W1 = PANEL->W2 + PANEL->ldu2 * JB; + PANEL->dW1 = PANEL->dW2 + PANEL->ldu2 * JB; + + PANEL->lindxA = (int*)(PANEL->L1 + JB * JB); + PANEL->dlindxA = (int*)(PANEL->dL1 + JB * JB); + PANEL->lindxAU = PANEL->lindxA + JB; + PANEL->dlindxAU = PANEL->dlindxA + JB; + PANEL->lindxU = PANEL->lindxAU + JB; + PANEL->dlindxU = PANEL->dlindxAU + JB; + PANEL->permU = PANEL->lindxU + JB; + PANEL->dpermU = PANEL->dlindxU + JB; + + // Put ipiv array at the end + PANEL->ipiv = PANEL->permU + JB; + PANEL->dipiv = PANEL->dpermU + JB; + + PANEL->DINFO = ((double*)PANEL->lindxA) + lpiv; + PANEL->dDINFO = ((double*)PANEL->dlindxA) + lpiv; + } + + *(PANEL->DINFO) = 0.0; + + /* + * If nprow is 1, we just allocate an array of JB integers to store the + * pivot IDs during factoring, and a scratch array of mp integers. + * When nprow > 1, we allocate the space for the index arrays immediate- + * ly. The exact size of this array depends on the swapping routine that + * will be used, so we allocate the maximum: + * + * IWORK[0] is of size at most 1 + + * IPL is of size at most 1 + + * IPID is of size at most 4 * JB + + * IPIV is of size at most JB + + * SCRATCH is of size at most MP + * + * ipA is of size at most 1 + + * iplen is of size at most NPROW + 1 + + * ipcounts is of size at most NPROW + + * ioffsets is of size at most NPROW + + * iwork is of size at most MAX( 2*JB, NPROW+1 ). + * + * that is mp + 4 + 5*JB + 3*NPROW + MAX( 2*JB, NPROW+1 ). 
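+ *
+ * As an illustrative example (hypothetical numbers): with mp = 46080,
+ * JB = 512 and NPROW = 4 this amounts to
+ * 46080 + 4 + 5*512 + 3*4 + max(1024, 5) = 49680 integers.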
+ *
+ * We use the first entry of this work array to indicate whether the
+ * local index arrays have already been computed, and if yes, by
+ * which function:
+ * IWORK[0] = -1: no index arrays have been computed so far;
+ * IWORK[0] = 1: HPL_pdlaswp already computed those arrays;
+ * This allows us to save some redundant computations.
+ */
+  if(nprow == 1) {
+    lwork = mp + JB;
+  } else {
+    itmp1 = (JB << 1);
+    lwork = nprow + 1;
+    itmp1 = Mmax(itmp1, lwork);
+    lwork = mp + 4 + (5 * JB) + (3 * nprow) + itmp1;
+  }
+
+  if(PANEL->max_iwork_size < (size_t)(lwork) * sizeof(int)) {
+    if(PANEL->IWORK) { free(PANEL->IWORK); }
+    size_t numbytes = (size_t)(lwork) * sizeof(int);
+
+    if(Malloc(GRID, (void**)&(PANEL->IWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for integer workspace.");
+    }
+    PANEL->max_iwork_size = (size_t)(lwork) * sizeof(int);
+  }
+
+  if(lwork) *(PANEL->IWORK) = -1;
+
+  /*Finally, we need 2*(4 + 2*JB) entries of scratch for pdfact */
+  lwork = (size_t)(((4 + ((unsigned int)(JB) << 1)) << 1));
+  if(PANEL->max_fwork_size < (size_t)(lwork) * sizeof(double)) {
+    if(PANEL->fWORK) { free(PANEL->fWORK); }
+    size_t numbytes = (size_t)(lwork) * sizeof(double);
+
+    if(Malloc(GRID, (void**)&(PANEL->fWORK), numbytes) != HPL_SUCCESS) {
+      HPL_pabort(__LINE__,
+                 "HPL_pdpanel_init",
+                 "Host memory allocation failed for pdfact scratch workspace.");
+    }
+    PANEL->max_fwork_size = (size_t)(lwork) * sizeof(double);
+  }
+}
diff --git a/src/panel/HPL_pdpanel_new.cpp b/src/panel/HPL_pdpanel_new.cpp
new file mode 100644
index 0000000..f790182
--- /dev/null
+++ b/src/panel/HPL_pdpanel_new.cpp
@@ -0,0 +1,105 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_pdpanel_new(HPL_T_grid* GRID,
+                     HPL_T_palg* ALGO,
+                     const int M,
+                     const int N,
+                     const int JB,
+                     HPL_T_pmat* A,
+                     const int IA,
+                     const int JA,
+                     const int TAG,
+                     HPL_T_panel** PANEL) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdpanel_new creates and initializes a panel data structure.
+   *
+   * Arguments
+   * =========
+   *
+   * GRID (local input) HPL_T_grid *
+   * On entry, GRID points to the data structure containing the
+   * process grid information.
+   *
+   * ALGO (global input) HPL_T_palg *
+   * On entry, ALGO points to the data structure containing the
+   * algorithmic parameters.
+   *
+   * M (local input) const int
+   * On entry, M specifies the global number of rows of the panel.
+   * M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the global number of columns of the
+   * panel and trailing submatrix. N must be at least zero.
+   *
+   * JB (global input) const int
+   * On entry, JB specifies the number of columns of the panel.
+   * JB must be at least zero.
+   *
+   * A (local input/output) HPL_T_pmat *
+   * On entry, A points to the data structure containing the local
+   * array information.
+ * + * IA (global input) const int + * On entry, IA is the global row index identifying the panel + * and trailing submatrix. IA must be at least zero. + * + * JA (global input) const int + * On entry, JA is the global column index identifying the panel + * and trailing submatrix. JA must be at least zero. + * + * TAG (global input) const int + * On entry, TAG is the row broadcast message id. + * + * PANEL (local input/output) HPL_T_panel * * + * On entry, PANEL points to the address of the panel data + * structure to create and initialize. + * + * --------------------------------------------------------------------- + */ + + HPL_T_panel* p = NULL; + + /* + * Allocate the panel structure - Check for enough memory + */ + if(!(p = (HPL_T_panel*)malloc(sizeof(HPL_T_panel)))) { + HPL_pabort(__LINE__, "HPL_pdpanel_new", "Memory allocation failed"); + } + + p->max_pinned_work_size = 0; + p->max_lwork_size = 0; + p->max_uwork_size = 0; + p->max_iwork_size = 0; + p->max_fwork_size = 0; + p->free_work_now = 0; + p->A = NULL; + p->LWORK = NULL; + p->dLWORK = NULL; + p->UWORK = NULL; + p->dUWORK = NULL; + p->fWORK = NULL; + p->IWORK = NULL; + HPL_pdpanel_init(GRID, ALGO, M, N, JB, A, IA, JA, TAG, p); + *PANEL = p; +} diff --git a/src/panel/HPL_pdpanel_wait.cpp b/src/panel/HPL_pdpanel_wait.cpp new file mode 100644 index 0000000..5bc6bcf --- /dev/null +++ b/src/panel/HPL_pdpanel_wait.cpp @@ -0,0 +1,22 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ +#include "hpl.hpp" + +void HPL_pdpanel_Wait(HPL_T_panel* PANEL) { + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_COPY); +#endif + // Wait for panel + hipStreamSynchronize(dataStream); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_COPY); +#endif +} diff --git a/src/pauxil/HPL_dlaswp00N_device.cpp b/src/pauxil/HPL_dlaswp00N_device.cpp new file mode 100644 index 0000000..8819a82 --- /dev/null +++ b/src/pauxil/HPL_dlaswp00N_device.cpp @@ -0,0 +1,111 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define BLOCK_SIZE 512
+
+__global__ void dlaswp00N(const int N,
+                          const int M,
+                          double* __restrict__ A,
+                          const int LDA,
+                          const int* __restrict__ IPIV) {
+
+  __shared__ double s_An_init[2048];
+  __shared__ double s_An_ipiv[2048];
+
+  const int m = threadIdx.x;
+  const int n = blockIdx.x;
+
+  // read in block column
+  for(int i = m; i < M; i += blockDim.x)
+    s_An_init[i] = A[i + n * ((size_t)LDA)];
+
+  __syncthreads();
+
+  // local block
+  for(int i = m; i < M; i += blockDim.x) {
+    const int ip = IPIV[i];
+
+    if(ip < M) { // local swap
+      s_An_ipiv[i] = s_An_init[ip];
+    } else { // non local swap
+      s_An_ipiv[i] = A[ip + n * ((size_t)LDA)];
+    }
+  }
+  __syncthreads();
+
+  // write out local block
+  for(int i = m; i < M; i += blockDim.x)
+    A[i + n * ((size_t)LDA)] = s_An_ipiv[i];
+
+  // remaining swaps in column
+  for(int i = m; i < M; i += blockDim.x) {
+    const int ip_ex = IPIV[i + M];
+
+    if(ip_ex > -1) { A[ip_ex + n * ((size_t)LDA)] = s_An_init[i]; }
+  }
+}
+
+void HPL_dlaswp00N(const int M,
+                   const int N,
+                   double* A,
+                   const int LDA,
+                   const int* IPIV) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_dlaswp00N performs a series of local row interchanges on a matrix
+   * A. One row interchange is initiated for rows 0 through M-1 of A.
+   *
+   * Arguments
+   * =========
+   *
+   * M (local input) const int
+   * On entry, M specifies the number of rows of the array A to be
+   * interchanged. M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the number of columns of the array A.
+   * N must be at least zero.
+   *
+   * A (local input/output) double *
+   * On entry, A points to an array of dimension (LDA,N) to which
+   * the row interchanges will be applied. On exit, the permuted
+   * matrix.
+   *
+   * LDA (local input) const int
+   * On entry, LDA specifies the leading dimension of the array A.
+   * LDA must be at least MAX(1,M).
+   *
+   * IPIV (local input) const int *
+   * On entry, IPIV is an array of size 2*M that contains the
+   * pivoting information: for k in [0..M), IPIV[k]=IROFF + l
+   * implies that local rows k and l are to be interchanged, and
+   * the next M entries hold the out-of-block destination rows
+   * (-1 when unused), as assembled in HPL_pdpanel_SendToDevice.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  if((M <= 0) || (N <= 0)) return;
+
+  hipStream_t stream;
+  rocblas_get_stream(handle, &stream);
+
+  int grid_size = N;
+  dlaswp00N<<<grid_size, dim3(BLOCK_SIZE), 0, stream>>>(N, M, A, LDA, IPIV);
+}
diff --git a/src/pauxil/HPL_dlaswp01T_device.cpp b/src/pauxil/HPL_dlaswp01T_device.cpp
new file mode 100644
index 0000000..d858d47
--- /dev/null
+++ b/src/pauxil/HPL_dlaswp01T_device.cpp
@@ -0,0 +1,135 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+
+/* Build U matrix from rows of A */
+__global__ void dlaswp01T(const int M,
+                          const int N,
+                          double* __restrict__ A,
+                          const int LDA,
+                          double* __restrict__ U,
+                          const int LDU,
+                          const int* __restrict__ LINDXU) {
+
+  __shared__ double s_U[TILE_DIM][TILE_DIM + 1];
+
+  const int m = threadIdx.x + TILE_DIM * blockIdx.x;
+  const int n = threadIdx.y + TILE_DIM * blockIdx.y;
+
+  if(m < M) {
+    const int ipa = LINDXU[m];
+
+    // save in LDS for the moment
+    // possible cache-hits if ipas are close
+    s_U[threadIdx.x][threadIdx.y + 0] =
+        (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 8] =
+        (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 16] =
+        (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0;
+    s_U[threadIdx.x][threadIdx.y + 24] =
+        (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0;
+  }
+
+  __syncthreads();
+
+  const int um = threadIdx.y + TILE_DIM * blockIdx.x;
+  const int un = threadIdx.x + TILE_DIM * blockIdx.y;
+
+  if(un < N) {
+    // write out chunks of U
+    if((um + 0) < M)
+      U[un + (um + 0) * ((size_t)LDU)] = s_U[threadIdx.y + 0][threadIdx.x];
+    if((um + 8) < M)
+      U[un + (um + 8) * ((size_t)LDU)] = s_U[threadIdx.y + 8][threadIdx.x];
+    if((um + 16) < M)
+      U[un + (um + 16) * ((size_t)LDU)] = s_U[threadIdx.y + 16][threadIdx.x];
+    if((um + 24) < M)
+      U[un + (um + 24) * ((size_t)LDU)] = s_U[threadIdx.y + 24][threadIdx.x];
+  }
+}
+
+void HPL_dlaswp01T(const int M,
+                   const int N,
+                   double* A,
+                   const int LDA,
+                   double* U,
+                   const int LDU,
+                   const int* LINDXU) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_dlaswp01T copies scattered rows of A into an array U. The
+   * row offsets in A of the source rows are specified by LINDXU.
+   * Rows of A are stored as columns in U.
+   *
+   * Arguments
+   * =========
+   *
+   * M (local input) const int
+   * On entry, M specifies the number of rows of A that should be
+   * copied into U. M must be at least zero.
+   *
+   * N (local input) const int
+   * On entry, N specifies the length of the rows of A that should
+   * be copied into U. N must be at least zero.
+   *
+   * A (local input/output) double *
+   * On entry, A points to an array of dimension (LDA,N). The rows
+   * of this array specified by LINDXU are copied into U.
+   *
+   * LDA (local input) const int
+   * On entry, LDA specifies the leading dimension of the array A.
+   * LDA must be at least MAX(1,M).
+   *
+   * U (local input/output) double *
+   * On entry, U points to an array of dimension (LDU,M). On exit,
+   * the rows of A specified by LINDXU are stored as columns within
+   * this array U.
+   *
+   * LDU (local input) const int
+   * On entry, LDU specifies the leading dimension of the array U.
+   * LDU must be at least MAX(1,N).
+   *
+   * LINDXU (local input) const int *
+   * On entry, LINDXU is an array of dimension M that contains the
+   * local row indexes of A that should be copied into U.
+   *
+   * ---------------------------------------------------------------------
+   */
+  /*
+   * .. Local Variables ..
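+ *
+ * (Implementation note: the kernel stages 32x32 tiles of the gathered
+ * rows in shared memory so that both the reads from A and the
+ * transposed writes to U are coalesced.)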
+ */ + + if((M <= 0) || (N <= 0)) return; + + dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM); + dim3 block_size(TILE_DIM, BLOCK_ROWS); + dlaswp01T<<<grid_size, block_size>>>( + M, N, A, LDA, U, LDU, LINDXU); + + /* + * End of HPL_dlaswp01T + */ +} diff --git a/src/pauxil/HPL_dlaswp02T_device.cpp b/src/pauxil/HPL_dlaswp02T_device.cpp new file mode 100644 index 0000000..c950596 --- /dev/null +++ b/src/pauxil/HPL_dlaswp02T_device.cpp @@ -0,0 +1,106 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> +#include <cassert> + +#define assertm(exp, msg) assert(((void)msg, exp)) + +/* Perform any local row swaps of A */ +__global__ void dlaswp02T(const int M, + const int N, + double* __restrict__ A, + const int LDA, + const int* __restrict__ LINDXAU, + const int* __restrict__ LINDXA) { + + const int n = blockIdx.x; + const int m = threadIdx.x; + + const int ipau = LINDXAU[m]; // src row + const int ipa = LINDXA[m]; // dst row + + const double An = A[ipau + n * ((size_t)LDA)]; + + __syncthreads(); + + A[ipa + n * ((size_t)LDA)] = An; +} + +void HPL_dlaswp02T(const int M, + const int N, + double* A, + const int LDA, + const int* LINDXAU, + const int* LINDXA) { + /* + * Purpose + * ======= + * + * HPL_dlaswp02T copies scattered rows of A into itself. The row + * offsets in A of the source rows are specified by LINDXAU, and the + * row offsets in A of their destinations are specified by LINDXA. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A that should be + * moved within A. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the length of rows of A that should be + * moved within A. N must be at least zero. + * + * A (local input/output) double * + * On entry, A points to an array of dimension (LDA,N). The rows + * of this array specified by LINDXAU are moved within A. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M). + * + * LINDXAU (local input) const int * + * On entry, LINDXAU is an array of dimension M that contains + * the local row indexes of A that should be moved within A + * (the source rows). + * + * LINDXA (local input) const int * + * On entry, LINDXA is an array of dimension M that contains + * the local row indexes of A where the rows of A should be + * copied to (the destination rows). + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables ..
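+ * + * Implementation note: the kernel above uses one block per column and + * one thread per row; every thread first loads its source entry + * A(LINDXAU[m], n), and the __syncthreads() guarantees that all reads + * in the block complete before any destination A(LINDXA[m], n) is + * written, so overlapping source and destination rows cannot race + * within a column. This is also why M is limited to a single block by + * the assert below.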
+ */ + + if((M <= 0) || (N <= 0)) return; + + assertm(M <= 1024, "NB too large in HPL_dlaswp02T"); + + dim3 grid_size(N); + dim3 block_size(M); + dlaswp02T<<<grid_size, block_size>>>(M, N, A, LDA, LINDXAU, LINDXA); + + /* + * End of HPL_dlaswp02T + */ +} diff --git a/src/pauxil/HPL_dlaswp03T_device.cpp b/src/pauxil/HPL_dlaswp03T_device.cpp new file mode 100644 index 0000000..4264538 --- /dev/null +++ b/src/pauxil/HPL_dlaswp03T_device.cpp @@ -0,0 +1,133 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 + +/* Build W matrix from rows of A */ +__global__ void dlaswp03T(const int M, + const int N, + double* __restrict__ A, + const int LDA, + double* __restrict__ W, + const int LDW, + const int* __restrict__ LINDXU) { + + __shared__ double s_W[TILE_DIM][TILE_DIM + 1]; + + const int m = threadIdx.x + TILE_DIM * blockIdx.x; + const int n = threadIdx.y + TILE_DIM * blockIdx.y; + + if(m < M) { + const int ipa = LINDXU[m]; + + // save in LDS for the moment + // possible cache-hits if ipas are close + s_W[threadIdx.x][threadIdx.y + 0] = + (n + 0 < N) ? A[ipa + (n + 0) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 8] = + (n + 8 < N) ? A[ipa + (n + 8) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 16] = + (n + 16 < N) ? A[ipa + (n + 16) * ((size_t)LDA)] : 0.0; + s_W[threadIdx.x][threadIdx.y + 24] = + (n + 24 < N) ? A[ipa + (n + 24) * ((size_t)LDA)] : 0.0; + } + + __syncthreads(); + + const int wm = threadIdx.y + TILE_DIM * blockIdx.x; + const int wn = threadIdx.x + TILE_DIM * blockIdx.y; + + if(wn < N) { + // write out chunks of W + if((wm + 0) < M) + W[wn + (wm + 0) * ((size_t)LDW)] = s_W[threadIdx.y + 0][threadIdx.x]; + if((wm + 8) < M) + W[wn + (wm + 8) * ((size_t)LDW)] = s_W[threadIdx.y + 8][threadIdx.x]; + if((wm + 16) < M) + W[wn + (wm + 16) * ((size_t)LDW)] = s_W[threadIdx.y + 16][threadIdx.x]; + if((wm + 24) < M) + W[wn + (wm + 24) * ((size_t)LDW)] = s_W[threadIdx.y + 24][threadIdx.x]; + } +} + +void HPL_dlaswp03T(const int M, + const int N, + double* A, + const int LDA, + double* W, + const int LDW, + const int* LINDXU) { + /* + * Purpose + * ======= + * + * HPL_dlaswp03T packs scattered rows of an array A into workspace W. + * The row offsets in A are specified by LINDXU. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A that should be + * copied into W. M must be at least zero. + * + * N (local input) const int + * On entry, N specifies the length of the rows of A that should + * be copied into W. N must be at least zero. + * + * A (local input) double * + * On entry, A points to an array of dimension (LDA,N). The rows + * of this array specified by LINDXU are copied into W. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M).
+ * + * W (local output) double * + * On entry, W points to an array of dimension (LDW,*). On exit, + * W contains the rows of A specified by LINDXU, stored as + * columns. + * + * LDW (local input) const int + * On entry, LDW specifies the leading dimension of the array W. + * LDW must be at least MAX(1,N). + * + * LINDXU (local input) const int * + * On entry, LINDXU is an array of dimension M that contains the + * local row indexes of A that should be copied into W. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + + if((M <= 0) || (N <= 0)) return; + + dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM); + dim3 block_size(TILE_DIM, BLOCK_ROWS); + dlaswp03T<<<grid_size, block_size>>>( + M, N, A, LDA, W, LDW, LINDXU); + + /* + * End of HPL_dlaswp03T + */ +} diff --git a/src/pauxil/HPL_dlaswp04T_device.cpp b/src/pauxil/HPL_dlaswp04T_device.cpp new file mode 100644 index 0000000..ca94a00 --- /dev/null +++ b/src/pauxil/HPL_dlaswp04T_device.cpp @@ -0,0 +1,128 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define TILE_DIM 32 +#define BLOCK_ROWS 8 + +static __global__ void dlaswp04T(const int M, + const int N, + double* __restrict__ A, + const int LDA, + double* __restrict__ W, + const int LDW, + const int* __restrict__ LINDXU) { + + __shared__ double s_W[TILE_DIM][TILE_DIM + 1]; + + const int am = threadIdx.x + TILE_DIM * blockIdx.x; + const int an = threadIdx.y + TILE_DIM * blockIdx.y; + + const int wm = threadIdx.y + TILE_DIM * blockIdx.x; + const int wn = threadIdx.x + TILE_DIM * blockIdx.y; + + if(wn < N) { + s_W[threadIdx.y + 0][threadIdx.x] = + (wm + 0 < M) ? W[wn + (wm + 0) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 8][threadIdx.x] = + (wm + 8 < M) ? W[wn + (wm + 8) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 16][threadIdx.x] = + (wm + 16 < M) ? W[wn + (wm + 16) * ((size_t)LDW)] : 0.0; + s_W[threadIdx.y + 24][threadIdx.x] = + (wm + 24 < M) ? W[wn + (wm + 24) * ((size_t)LDW)] : 0.0; + } + + __syncthreads(); + + if(am < M) { + const int aip = LINDXU[am]; + if((an + 0) < N) + A[aip + (an + 0) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 0]; + if((an + 8) < N) + A[aip + (an + 8) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 8]; + if((an + 16) < N) + A[aip + (an + 16) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 16]; + if((an + 24) < N) + A[aip + (an + 24) * ((size_t)LDA)] = s_W[threadIdx.x][threadIdx.y + 24]; + } +} + +void HPL_dlaswp04T(const int M, + const int N, + double* A, + const int LDA, + double* W, + const int LDW, + const int* LINDXU) { + /* + * Purpose + * ======= + * + * HPL_dlaswp04T writes columns of W into rows of A at positions + * indicated by LINDXU. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of A that should be + * replaced with columns of W. M must be at least zero.
+ * + * N (local input) const int + * On entry, N specifies the length of the rows of A that should + * be replaced with columns of W. N must be at least zero. + * + * A (local output) double * + * On entry, A points to an array of dimension (LDA,N). On exit, + * the rows of this array specified by LINDXU are replaced by + * columns of W. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M). + * + * W (local input) double * + * On entry, W points to an array of dimension (LDW,*). This + * array contains the columns of W that are to be written to + * rows of A. + * + * LDW (local input) const int + * On entry, LDW specifies the leading dimension of the array W. + * LDW must be at least MAX(1,N). + * + * LINDXU (local input) const int * + * On entry, LINDXU is an array of dimension M that contains the + * local row indexes of A that should be replaced with columns + * of W. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + + if((M <= 0) || (N <= 0)) return; + + dim3 grid_size((M + TILE_DIM - 1) / TILE_DIM, (N + TILE_DIM - 1) / TILE_DIM); + dim3 block_size(TILE_DIM, BLOCK_ROWS); + dlaswp04T<<<grid_size, block_size>>>( + M, N, A, LDA, W, LDW, LINDXU); + + /* + * End of HPL_dlaswp04T + */ +} diff --git a/src/pauxil/HPL_dlaswp10N_device.cpp b/src/pauxil/HPL_dlaswp10N_device.cpp new file mode 100644 index 0000000..78a8910 --- /dev/null +++ b/src/pauxil/HPL_dlaswp10N_device.cpp @@ -0,0 +1,91 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define BLOCK_SIZE 512 + +__global__ void dlaswp10N(const int M, + const int N, + double* __restrict__ A, + const int LDA, + const int* __restrict__ IPIV) { + + const int m = threadIdx.x + BLOCK_SIZE * blockIdx.x; + + if(m < M) { + for(int i = 0; i < N; i++) { + const int ip = IPIV[i]; + + if(ip != i) { + // swap + const double Ai = A[m + i * ((size_t)LDA)]; + const double Aip = A[m + ip * ((size_t)LDA)]; + A[m + i * ((size_t)LDA)] = Aip; + A[m + ip * ((size_t)LDA)] = Ai; + } + } + } +} + +void HPL_dlaswp10N(const int M, + const int N, + double* A, + const int LDA, + const int* IPIV) { + /* + * Purpose + * ======= + * + * HPL_dlaswp10N performs a sequence of local column interchanges on a + * matrix A. One column interchange is initiated for columns 0 through + * N-1 of A. + * + * Arguments + * ========= + * + * M (local input) const int + * On entry, M specifies the number of rows of the array A. M + * must be at least zero. + * + * N (local input) const int + * On entry, N specifies the number of columns of the array A. N + * must be at least zero. + * + * A (local input/output) double * + * On entry, A points to an array of dimension (LDA,N). This + * array contains the columns onto which the interchanges should + * be applied. On exit, A contains the permuted matrix. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least MAX(1,M).
+ * + * IPIV (local input) const int * + * On entry, IPIV is an array of size N that contains the + * pivoting information: for k in [0..N), IPIV[k]=l implies + * that local columns k and l are to be interchanged. + * + * --------------------------------------------------------------------- + */ + + if((M <= 0) || (N <= 0)) return; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + dim3 grid_size((M + BLOCK_SIZE - 1) / BLOCK_SIZE); + dlaswp10N<<<grid_size, BLOCK_SIZE, 0, stream>>>(M, N, A, LDA, IPIV); +} diff --git a/src/pauxil/HPL_indxg2l.cpp b/src/pauxil/HPL_indxg2l.cpp new file mode 100644 index 0000000..4ae5811 --- /dev/null +++ b/src/pauxil/HPL_indxg2l.cpp @@ -0,0 +1,96 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_indxg2l(const int IG, + const int INB, + const int NB, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxg2l computes the local index of a matrix entry pointed to by + * the global index IG. The returned local index is the same in all + * processes. + * + * Arguments + * ========= + * + * IG (input) const int + * On entry, IG specifies the global index of the matrix entry. + * IG must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix. NB must be larger than one. + * + * SRCPROC (input) const int + * On entry, if SRCPROC = -1, the data is not distributed but + * replicated, in which case this routine returns IG in all + * processes. Otherwise, the value of SRCPROC is ignored. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int i, j; + + if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) + /* + * IG belongs to the first block, or the data is not distributed, or + * there is just one process in this dimension of the grid. + */ + return (IG); + /* + * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, + * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC + * with 0 <= MYROC < NPROCS. The local index to be returned depends on + * whether IG resides in the process owning the first partial block of + * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, + * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. + * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is + * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, + * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that + * j=l and thus (j+1)*NPROCS > i+1.
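+ * + * As a worked example (numbers chosen for illustration): with INB=4, + * NB=2, NPROCS=3 and IG=9, we get i = (9-4)/2 = 2 and j = 2/3 = 0, so + * i+1 = 3 = (j+1)*NPROCS and IG lives in the process owning the first + * partial block (MYROC=0); the return statement below then yields + * IL = NB*(j-i) + IG = -4 + 9 = 5, i.e. global row 9 comes after the + * first block of 4 local rows plus one full local block of 2.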
+ */ + j = (i = (IG - INB) / NB) / NPROCS; + /* + * When IG resides in the process owning the first partial block of size + * INB (MYROC = 0), then the result IL can be written as: + * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB. + * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, + * i.e. l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore + * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. + * + * Otherwise when MYROC >= 1, the result IL can be written as: + * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. + * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, + * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e. + * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. + */ + return (NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG)); +} diff --git a/src/pauxil/HPL_indxg2lp.cpp b/src/pauxil/HPL_indxg2lp.cpp new file mode 100644 index 0000000..e148f83 --- /dev/null +++ b/src/pauxil/HPL_indxg2lp.cpp @@ -0,0 +1,116 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_indxg2lp(int* IL, + int* PROC, + const int IG, + const int INB, + const int NB, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxg2lp computes the local index of a matrix entry pointed to by + * the global index IG as well as the process coordinate which possesses + * this entry. The returned local index is the same in all processes. + * + * Arguments + * ========= + * + * IL (output) int * + * On exit, IL specifies the local index corresponding to IG. IL + * is at least zero. + * + * PROC (output) int * + * On exit, PROC is the coordinate of the process owning the + * entry specified by the global index IG. PROC is at least zero + * and less than NPROCS. + * + * IG (input) const int + * On entry, IG specifies the global index of the matrix entry. + * IG must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * SRCPROC (input) const int + * On entry, if SRCPROC = -1, the data is not distributed but + * replicated, in which case this routine returns IG in all + * processes. Otherwise, the value of SRCPROC is ignored. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int i, j; + + if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) { + /* + * IG belongs to the first block, or the data is not distributed, or + * there is just one process in this dimension of the grid.
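+ * (In this branch there is nothing to compute: every process reports + * IL = IG, and PROC is simply SRCPROC, which is -1 in the replicated + * case.)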
+ */ + *IL = IG; + *PROC = SRCPROC; + } else { + /* + * IG = INB - NB + ( l * NPROCS + MYROC ) * NB + X with 0 <= X < NB, + * thus IG is to be found in the block (IG-INB+NB) / NB = l*NPROCS+MYROC + * with 0 <= MYROC < NPROCS. The local index to be returned depends on + * whether IG resides in the process owning the first partial block of + * size INB (MYROC=0). To determine this cheaply, let i = (IG-INB) / NB, + * so that if NPROCS divides i+1, i.e. MYROC=0, we have i+1 = l*NPROCS. + * If we set j = i / NPROCS, it follows that j = l-1. Therefore, i+1 is + * equal to (j+1) * NPROCS. Conversely, if NPROCS does not divide i+1, + * then i+1 = l*NPROCS + MYROC with 1 <= MYROC < NPROCS. It follows that + * j=l and thus (j+1)*NPROCS > i+1. + */ + j = (i = (IG - INB) / NB) / NPROCS; + /* + * IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC and take + * the NPROCS modulo (definition of the block-cyclic data distribution). + */ + *PROC = SRCPROC + 1 + i; + *PROC = MPosMod(*PROC, NPROCS); + /* + * When IG resides in the process owning the first partial block of size + * INB (MYROC = 0), then the result IL can be written as: + * IL = INB - NB + l * NB + X = IG + ( l - (l * NPROCS + MYROC) ) * NB. + * Using the above notation, we have i+1 = l*NPROCS + MYROC = l*NPROCS, + * i.e. l = ( i+1 ) / NPROCS = j+1, since NPROCS divides i+1, therefore + * IL = IG + ( j + 1 - ( i + 1 ) ) * NB. + * + * Otherwise when MYROC >= 1, the result IL can be written as: + * IL = l * NB + X = IG - INB + ( ( l+1 ) - ( l * NPROCS + MYROC ) )*NB. + * We still have i+1 = l*NPROCS+MYROC. Since NPROCS does not divide i+1, + * we have j = (l*NPROCS+MYROC-1) / NPROCS = l, i.e. + * IL = IG - INB + ( j + 1 - ( i + 1 ) ) * NB. + */ + *IL = NB * (j - i) + ((i + 1 - (j + 1) * NPROCS) ? IG - INB : IG); + } +} diff --git a/src/pauxil/HPL_indxg2p.cpp b/src/pauxil/HPL_indxg2p.cpp new file mode 100644 index 0000000..89f4cfd --- /dev/null +++ b/src/pauxil/HPL_indxg2p.cpp @@ -0,0 +1,74 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_indxg2p(const int IG, + const int INB, + const int NB, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxg2p computes the process coordinate which possesses the entry + * of a matrix specified by a global index IG. + * + * Arguments + * ========= + * + * IG (input) const int + * On entry, IG specifies the global index of the matrix entry. + * IG must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS.
+ * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int proc; + + if((IG < INB) || (SRCPROC == -1) || (NPROCS == 1)) + /* + * IG belongs to the first block, or the data is not distributed, or + * there is just one process in this dimension of the grid. + */ + return (SRCPROC); + /* + * Otherwise, IG is in block 1 + ( IG - INB ) / NB. Add this to SRCPROC + * and take the NPROCS modulo (definition of the block-cyclic data dis- + * tribution). + */ + proc = SRCPROC + 1 + (IG - INB) / NB; + return (MPosMod(proc, NPROCS)); +} diff --git a/src/pauxil/HPL_indxl2g.cpp b/src/pauxil/HPL_indxl2g.cpp new file mode 100644 index 0000000..3c646c0 --- /dev/null +++ b/src/pauxil/HPL_indxl2g.cpp @@ -0,0 +1,105 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_indxl2g(const int IL, + const int INB, + const int NB, + const int PROC, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_indxl2g computes the global index of a matrix entry pointed to + * by the local index IL of the process indicated by PROC. + * + * Arguments + * ========= + * + * IL (input) const int + * On entry, IL specifies the local index of the matrix entry. + * IL must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * PROC (input) const int + * On entry, PROC specifies the coordinate of the process whose + * local array row or column is to be determined. PROC must be + * at least zero and strictly less than NPROCS. + * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + if((SRCPROC == -1) || (NPROCS == 1)) { + /* + * The data is not distributed, or there is just one process in this di- + * mension of the grid. + */ + return (IL); + } else if(PROC == SRCPROC) { + /* + * If I am SRCPROC, my first block is of size INB + */ + if(IL < INB) + /* + * If IL belongs to the first block, the local and global indexes are + * equal. + */ + return (IL); + /* + * The number of entire blocks before the one IL belongs to is + * ( IL - INB ) / NB + 1. 
In the other NPROCS-1 processes, there are + * thus NB*( ( IL-INB )/NB + 1 ) entries, that are globally before the + * global entry corresponding to IL. + */ + return ((NPROCS - 1) * NB * ((IL - INB) / NB + 1) + IL); + } else if(PROC < SRCPROC) { + /* + * Otherwise, the process of coordinate MOD(SRCPROC+1, NPROCS) owns the + * second block. Let IPROC = PROC-SRCPROC-1+NPROCS be the number of pro- + * cesses between this process and PROC not included when going from + * left to right on the process line with possible wrap around. These + * IPROC processes have one more NB block than the other processes, who + * own IL / NB blocks of size NB. + */ + return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1 + NPROCS) + IL + + INB); + } else { + /* + * Same reasoning as above with IPROC = PROC - SRCPROC - 1. + */ + return (NB * ((NPROCS - 1) * (IL / NB) + PROC - SRCPROC - 1) + IL + INB); + } +} diff --git a/src/pauxil/HPL_infog2l.cpp b/src/pauxil/HPL_infog2l.cpp new file mode 100644 index 0000000..c64e6b1 --- /dev/null +++ b/src/pauxil/HPL_infog2l.cpp @@ -0,0 +1,280 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_infog2l(int I, + int J, + const int IMB, + const int MB, + const int INB, + const int NB, + const int RSRC, + const int CSRC, + const int MYROW, + const int MYCOL, + const int NPROW, + const int NPCOL, + int* II, + int* JJ, + int* PROW, + int* PCOL) { + /* + * Purpose + * ======= + * + * HPL_infog2l computes the starting local index II, JJ corresponding to + * the submatrix starting globally at the entry pointed by I, J. This + * routine returns the coordinates in the grid of the process owning the + * matrix entry of global indexes I, J, namely PROW and PCOL. + * + * Arguments + * ========= + * + * I (global input) int + * On entry, I specifies the global row index of the matrix + * entry. I must be at least zero. + * + * J (global input) int + * On entry, J specifies the global column index of the matrix + * entry. J must be at least zero. + * + * IMB (global input) const int + * On entry, IMB specifies the size of the first row block of + * the global matrix. IMB must be at least one. + * + * MB (global input) const int + * On entry, MB specifies the blocking factor used to partition + * and distribute the rows of the matrix A. MB must be larger + * than one. + * + * INB (global input) const int + * On entry, INB specifies the size of the first column block of + * the global matrix. INB must be at least one. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the columns of the matrix A. NB must be larger + * than one. + * + * RSRC (global input) const int + * On entry, RSRC specifies the row coordinate of the process + * that possesses the row I. RSRC must be at least zero and + * strictly less than NPROW. + * + * CSRC (global input) const int + * On entry, CSRC specifies the column coordinate of the process + * that possesses the column J. 
CSRC must be at least zero and + * strictly less than NPCOL. + * + * MYROW (local input) const int + * On entry, MYROW specifies my row process coordinate in the + * grid. MYROW is greater than or equal to zero and less than + * NPROW. + * + * MYCOL (local input) const int + * On entry, MYCOL specifies my column process coordinate in the + * grid. MYCOL is greater than or equal to zero and less than + * NPCOL. + * + * NPROW (global input) const int + * On entry, NPROW specifies the number of process rows in the + * grid. NPROW is at least one. + * + * NPCOL (global input) const int + * On entry, NPCOL specifies the number of process columns in + * the grid. NPCOL is at least one. + * + * II (local output) int * + * On exit, II specifies the local starting row index of the + * submatrix. On exit, II is at least 0. + * + * JJ (local output) int * + * On exit, JJ specifies the local starting column index of the + * submatrix. On exit, JJ is at least 0. + * + * PROW (global output) int * + * On exit, PROW is the row coordinate of the process owning the + * entry specified by the global index I. PROW is at least zero + * and less than NPROW. + * + * PCOL (global output) int * + * On exit, PCOL is the column coordinate of the process owning + * the entry specified by the global index J. PCOL is at least + * zero and less than NPCOL. + * + * --------------------------------------------------------------------- + */ + + int ilocblk, imb, inb, mb, mydist, nb, nblocks, csrc, rsrc; + + imb = IMB; + *PROW = RSRC; + + if((*PROW == -1) || (NPROW == 1)) { + /* + * The data is not distributed, or there is just one process row in the + * grid. + */ + *II = I; + } else if(I < imb) { + /* + * I refers to an entry in the first block of rows + */ + *II = (MYROW == *PROW ? I : 0); + } else { + mb = MB; + rsrc = *PROW; + /* + * The discussion goes as follows: compute my distance from the source + * process so that within this process coordinate system, the source + * process is the process such that mydist = 0, or equivalently + * MYROW == rsrc. + * + * Find out the global coordinate of the block I belongs to (nblocks), + * as well as the minimum local number of blocks that every process has. + * + * when mydist < nblocks-ilocblk*NPROCS, I own ilocblk + 1 full blocks, + * when mydist > nblocks-ilocblk*NPROCS, I own ilocblk full blocks, + * when mydist = nblocks-ilocblk*NPROCS, I own ilocblk full blocks + * but not I, or I own ilocblk + 1 blocks and the entry I refers to. + */ + if(MYROW == rsrc) { + /* + * I refers to an entry that is not in the first block, find out which + * process has it. + */ + nblocks = (I - imb) / mb + 1; + *PROW += nblocks; + *PROW -= (*PROW / NPROW) * NPROW; + /* + * Since mydist = 0 and nblocks - ilocblk * NPROW >= 0, there are only + * three possible cases: + * + * 1) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I do not own + * I, in which case II = IMB + ( ilocblk - 1 ) * MB. Note that this + * case cannot happen when ilocblk is zero, since nblocks is at + * least one. + * + * 2) When 0 = mydist = nblocks - ilocblk * NPROW = 0 and I own I, in + * which case I and II can respectively be written as IMB + + * (nblocks-1)*NB + IL and IMB + (ilocblk-1) * MB + IL. That is + * II = I + (ilocblk-nblocks)*MB. Note that this case cannot happen + * when ilocblk is zero, since nblocks is at least one. + * + * 3) mydist = 0 < nblocks - ilocblk * NPROW, the source process owns + * ilocblk+1 full blocks, and therefore II = IMB + ilocblk * MB. 
+ * Note that when ilocblk is zero, II is just IMB. + */ + if(nblocks < NPROW) { + *II = imb; + } else { + ilocblk = nblocks / NPROW; + if(ilocblk * NPROW >= nblocks) { + *II = ((MYROW == *PROW) ? I + (ilocblk - nblocks) * mb + : imb + (ilocblk - 1) * mb); + } else { + *II = imb + ilocblk * mb; + } + } + } else { + /* + * I refers to an entry that is not in the first block, find out which + * process has it. + */ + nblocks = (I -= imb) / mb + 1; + *PROW += nblocks; + *PROW -= (*PROW / NPROW) * NPROW; + /* + * Compute my distance from the source process so that within this pro- + * cess coordinate system, the source process is the process such that + * mydist=0. + */ + if((mydist = MYROW - rsrc) < 0) mydist += NPROW; + /* + * When mydist < nblocks - ilocblk * NPROW, I own ilocblk+1 full blocks + * of size MB since I am not the source process, i.e. II=(ilocblk+1)*MB. + * When mydist>=nblocks-ilocblk*NPROW and I do not own I, I own ilocblk + * full blocks of size MB, i.e. II = ilocblk*MB, otherwise I own ilocblk + * blocks and I, in which case I can be written as IMB + (nblocks-1)*MB + * + IL and II = ilocblk*MB + IL = I - IMB + (ilocblk - nblocks + 1)*MB. + */ + if(nblocks < NPROW) { + mydist -= nblocks; + *II = ((mydist < 0) ? mb + : ((MYROW == *PROW) ? I + (1 - nblocks) * mb : 0)); + } else { + ilocblk = nblocks / NPROW; + mydist -= nblocks - ilocblk * NPROW; + *II = + ((mydist < 0) ? (ilocblk + 1) * mb + : ((MYROW == *PROW) ? (ilocblk - nblocks + 1) * mb + I + : ilocblk * mb)); + } + } + } + /* + * Idem for the columns + */ + inb = INB; + *PCOL = CSRC; + + if((*PCOL == -1) || (NPCOL == 1)) { + *JJ = J; + } else if(J < inb) { + *JJ = (MYCOL == *PCOL ? J : 0); + } else { + nb = NB; + csrc = *PCOL; + + if(MYCOL == csrc) { + nblocks = (J - inb) / nb + 1; + *PCOL += nblocks; + *PCOL -= (*PCOL / NPCOL) * NPCOL; + + if(nblocks < NPCOL) { + *JJ = inb; + } else { + ilocblk = nblocks / NPCOL; + if(ilocblk * NPCOL >= nblocks) { + *JJ = ((MYCOL == *PCOL) ? J + (ilocblk - nblocks) * nb + : inb + (ilocblk - 1) * nb); + } else { + *JJ = inb + ilocblk * nb; + } + } + } else { + nblocks = (J -= inb) / nb + 1; + *PCOL += nblocks; + *PCOL -= (*PCOL / NPCOL) * NPCOL; + + if((mydist = MYCOL - csrc) < 0) mydist += NPCOL; + + if(nblocks < NPCOL) { + mydist -= nblocks; + *JJ = ((mydist < 0) ? nb + : ((MYCOL == *PCOL) ? J + (1 - nblocks) * nb : 0)); + } else { + ilocblk = nblocks / NPCOL; + mydist -= nblocks - ilocblk * NPCOL; + *JJ = + ((mydist < 0) ? (ilocblk + 1) * nb + : ((MYCOL == *PCOL) ? (ilocblk - nblocks + 1) * nb + J + : ilocblk * nb)); + } + } + } +} diff --git a/src/pauxil/HPL_numroc.cpp b/src/pauxil/HPL_numroc.cpp new file mode 100644 index 0000000..95b96ce --- /dev/null +++ b/src/pauxil/HPL_numroc.cpp @@ -0,0 +1,67 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_numroc(const int N, + const int INB, + const int NB, + const int PROC, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_numroc returns the local number of matrix rows/columns process + * PROC will get if we give out N rows/columns starting from global + * index 0. + * + * Arguments + * ========= + * + * N (input) const int + * On entry, N specifies the number of rows/columns being dealt + * out. N must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * PROC (input) const int + * On entry, PROC specifies the coordinate of the process whose + * local portion is determined. PROC must be at least zero and + * strictly less than NPROCS. + * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + return (HPL_numrocI(N, 0, INB, NB, PROC, SRCPROC, NPROCS)); +} diff --git a/src/pauxil/HPL_numrocI.cpp b/src/pauxil/HPL_numrocI.cpp new file mode 100644 index 0000000..7e22f5d --- /dev/null +++ b/src/pauxil/HPL_numrocI.cpp @@ -0,0 +1,185 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +int HPL_numrocI(const int N, + const int I, + const int INB, + const int NB, + const int PROC, + const int SRCPROC, + const int NPROCS) { + /* + * Purpose + * ======= + * + * HPL_numrocI returns the local number of matrix rows/columns process + * PROC will get if we give out N rows/columns starting from global + * index I. + * + * Arguments + * ========= + * + * N (input) const int + * On entry, N specifies the number of rows/columns being dealt + * out. N must be at least zero. + * + * I (input) const int + * On entry, I specifies the global index of the matrix entry. + * I must be at least zero. + * + * INB (input) const int + * On entry, INB specifies the size of the first block of the + * global matrix. INB must be at least one. + * + * NB (input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix A. NB must be larger than one. + * + * PROC (input) const int + * On entry, PROC specifies the coordinate of the process whose + * local portion is determined. PROC must be at least zero and + * strictly less than NPROCS.
+ * + * SRCPROC (input) const int + * On entry, SRCPROC specifies the coordinate of the process + * that possesses the first row or column of the matrix. SRCPROC + * must be at least zero and strictly less than NPROCS. + * + * NPROCS (input) const int + * On entry, NPROCS specifies the total number of process rows + * or columns over which the matrix is distributed. NPROCS must + * be at least one. + * + * --------------------------------------------------------------------- + */ + + int ilocblk, inb, mydist, nblocks, srcproc; + + if((SRCPROC == -1) || (NPROCS == 1)) + /* + * The data is not distributed, or there is just one process in this di- + * mension of the grid. + */ + return (N); + /* + * Compute coordinate of process owning I and corresponding INB + */ + srcproc = SRCPROC; + + if((inb = INB - I) <= 0) { + /* + * I is not in the first block, find out which process has it and update + * the size of first block + */ + srcproc += (nblocks = (-inb) / NB + 1); + srcproc -= (srcproc / NPROCS) * NPROCS; + inb += nblocks * NB; + } + /* + * Now everything is just like N, I=0, INB, NB, srcproc, NPROCS. The + * discussion goes as follows: compute my distance from the source pro- + * cess so that within this process coordinate system, the source pro- + * cess is the process such that mydist = 0, or PROC == srcproc. + * + * Find out how many full blocks are globally (nblocks) and locally + * (ilocblk) in those N entries. Then remark that + * + * when mydist < nblocks - ilocblk*NPROCS, I own ilocblk+1 full blocks, + * when mydist > nblocks - ilocblk*NPROCS, I own ilocblk full blocks, + * when mydist = nblocks - ilocblk*NPROCS, either the last block is not + * full and I own it, or the last block is full and I am the first pro- + * cess owning only ilocblk full blocks. + */ + if(PROC == srcproc) { + /* + * I am the source process, i.e. I own I (mydist=0). When N <= INB, the + * answer is simply N. + */ + if(N <= inb) return (N); + /* + * Find out how many full blocks are globally (nblocks) and locally + * (ilocblk) in those N entries. + */ + nblocks = (N - inb) / NB + 1; + /* + * Since mydist = 0 and nblocks - ilocblk * NPROCS >= 0, there are only + * two possible cases: + * + * 1) When mydist = nblocks - ilocblk * NPROCS = 0, that is NPROCS di- + * vides the global number of full blocks, then the source process + * srcproc owns one more block than the other processes; and N can + * be rewritten as N = INB + (nblocks-1) * NB + LNB with LNB >= 0 + * size of the last block. Similarly, the local value Np correspon- + * ding to N can be written as Np = INB + (ilocblk-1) * NB + LNB = + * N + ( ilocblk-1 - (nblocks-1) )*NB. Note that this case cannot + * happen when ilocblk is zero, since nblocks is at least one. + * + * 2) mydist = 0 < nblocks - ilocblk * NPROCS, the source process only + * owns full blocks, and therefore Np = INB + ilocblk * NB. Note + * that when ilocblk is zero, Np is just INB. + */ + if(nblocks < NPROCS) return (inb); + + ilocblk = nblocks / NPROCS; + return ((nblocks - ilocblk * NPROCS) ? inb + ilocblk * NB + : N + (ilocblk - nblocks) * NB); + } else { + /* + * I am not the source process. When N <= INB, the answer is simply 0. + */ + if(N <= inb) return (0); + /* + * Find out how many full blocks are globally (nblocks) and locally + * (ilocblk) in those N entries + */ + nblocks = (N - inb) / NB + 1; + /* + * Compute my distance from the source process so that within this pro- + * cess coordinate system, the source process is the process such that + * mydist=0.
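+ * + * As a concrete check of the formulas below (numbers chosen for + * illustration): distribute N=10 rows with INB=4 and NB=2 over + * NPROCS=3 processes starting at SRCPROC=0, so that nblocks=4 and + * ilocblk=1. The source process keeps inb + ilocblk*NB = 6 rows, the + * process at mydist=1 hits the boundary case and gets + * N - inb + NB*(ilocblk - nblocks + 1) = 2 rows, and the process at + * mydist=2 gets ilocblk*NB = 2 rows; 6 + 2 + 2 = 10 as required.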
+ */ + if((mydist = PROC - srcproc) < 0) mydist += NPROCS; + /* + * When mydist < nblocks - ilocblk*NPROCS, I own ilocblk + 1 full blocks + * of size NB since I am not the source process, + * + * when mydist > nblocks - ilocblk * NPROCS, I own ilocblk full blocks + * of size NB since I am not the source process, + * + * when mydist = nblocks - ilocblk*NPROCS, + * either the last block is not full and I own it, in which case + * N = INB + (nblocks - 1)*NB + LNB with LNB the size of the last + * block such that NB > LNB > 0; the local value Np corresponding to + * N is given by Np = ilocblk*NB+LNB = N-INB+(ilocblk-nblocks+1)*NB; + * or the last block is full and I am the first process owning only + * ilocblk full blocks of size NB, that is N = INB+(nblocks-1)*NB and + * Np = ilocblk * NB = N - INB + (ilocblk-nblocks+1) * NB. + */ + if(nblocks < NPROCS) + return ((mydist < nblocks) + ? NB + : ((mydist > nblocks) ? 0 : N - inb + NB * (1 - nblocks))); + + ilocblk = nblocks / NPROCS; + mydist -= nblocks - ilocblk * NPROCS; + return ((mydist < 0) + ? (ilocblk + 1) * NB + : ((mydist > 0) ? ilocblk * NB + : N - inb + NB * (ilocblk - nblocks + 1))); +} diff --git a/src/pauxil/HPL_pabort.cpp b/src/pauxil/HPL_pabort.cpp new file mode 100644 index 0000000..0a89a85 --- /dev/null +++ b/src/pauxil/HPL_pabort.cpp @@ -0,0 +1,85 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pabort(int LINE, const char* SRNAME, const char* FORM, ...) { + /* + * Purpose + * ======= + * + * HPL_pabort displays an error message on stderr and halts execution. + * + * + * Arguments + * ========= + * + * LINE (local input) int + * On entry, LINE specifies the line number in the file where + * the error has occurred. When LINE is not a positive line + * number, it is ignored. + * + * SRNAME (local input) const char * + * On entry, SRNAME should be the name of the routine calling + * this error handler. + * + * FORM (local input) const char * + * On entry, FORM specifies the format, i.e., how the subsequent + * arguments are converted for output. + * + * (local input) ... + * On entry, ... is the list of arguments to be printed within + * the format string.
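+ * + * A typical call site, as used by HPL_pdlange later in this patch: + * + * HPL_pabort(__LINE__, "HPL_pdlange", "Memory allocation failed"); + * + * which prints the rank of the failing process together with the line + * and routine name on stderr, and then calls MPI_Abort on + * MPI_COMM_WORLD.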
+ * + * --------------------------------------------------------------------- + */ + + va_list argptr; + int rank; + char cline[128]; + + va_start(argptr, FORM); + (void)vsnprintf(cline, sizeof(cline), FORM, argptr); + va_end(argptr); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* + * Display an error message + */ + if(LINE <= 0) + HPL_fprintf(stderr, + "%s %s %d, %s %s:\n>>> %s <<< Abort ...\n\n", + "HPL ERROR", + "from process #", + rank, + "in function", + SRNAME, + cline); + else + HPL_fprintf(stderr, + "%s %s %d, %s %d %s %s:\n>>> %s <<< Abort ...\n\n", + "HPL ERROR", + "from process #", + rank, + "on line", + LINE, + "of function", + SRNAME, + cline); + + MPI_Abort(MPI_COMM_WORLD, -1); + exit(-1); +} diff --git a/src/pauxil/HPL_pdlamch.cpp b/src/pauxil/HPL_pdlamch.cpp new file mode 100644 index 0000000..e6fb8e8 --- /dev/null +++ b/src/pauxil/HPL_pdlamch.cpp @@ -0,0 +1,87 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +double HPL_pdlamch(MPI_Comm COMM, const HPL_T_MACH CMACH) { + /* + * Purpose + * ======= + * + * HPL_pdlamch determines machine-specific arithmetic constants such as + * the relative machine precision (eps), the safe minimum (sfmin) such + * that 1/sfmin does not overflow, the base of the machine (base), the + * precision (prec), the number of (base) digits in the mantissa (t), + * whether rounding occurs in addition (rnd = 1.0 and 0.0 otherwise), + * the minimum exponent before (gradual) underflow (emin), the underflow + * threshold (rmin) = base**(emin-1), the largest exponent before + * overflow (emax), and the overflow threshold (rmax) = + * (base**emax)*(1-eps). + * + * Arguments + * ========= + * + * COMM (global/local input) MPI_Comm + * The MPI communicator identifying the process collection. + * + * CMACH (global input) const HPL_T_MACH + * Specifies the value to be returned by HPL_pdlamch + * = HPL_MACH_EPS, HPL_pdlamch := eps (default) + * = HPL_MACH_SFMIN, HPL_pdlamch := sfmin + * = HPL_MACH_BASE, HPL_pdlamch := base + * = HPL_MACH_PREC, HPL_pdlamch := eps*base + * = HPL_MACH_MLEN, HPL_pdlamch := t + * = HPL_MACH_RND, HPL_pdlamch := rnd + * = HPL_MACH_EMIN, HPL_pdlamch := emin + * = HPL_MACH_RMIN, HPL_pdlamch := rmin + * = HPL_MACH_EMAX, HPL_pdlamch := emax + * = HPL_MACH_RMAX, HPL_pdlamch := rmax + * + * where + * + * eps = relative machine precision, + * sfmin = safe minimum, + * base = base of the machine, + * prec = eps*base, + * t = number of digits in the mantissa, + * rnd = 1.0 if rounding occurs in addition, + * emin = minimum exponent before underflow, + * rmin = underflow threshold, + * emax = largest exponent before overflow, + * rmax = overflow threshold.
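+ * + * Note that the reductions below keep the most conservative value over + * the whole grid: eps, sfmin, emin and rmin are combined with HPL_MAX, + * while emax and rmax are combined with HPL_MIN, so the returned + * constant is valid on every participating process even if the + * processes disagree.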
+ * + * --------------------------------------------------------------------- + */ + + double param; + + param = HPL_dlamch(CMACH); + + switch(CMACH) { + case HPL_MACH_EPS: + case HPL_MACH_SFMIN: + case HPL_MACH_EMIN: + case HPL_MACH_RMIN: + (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MAX, COMM); + break; + case HPL_MACH_EMAX: + case HPL_MACH_RMAX: + (void)HPL_all_reduce((void*)(&param), 1, HPL_DOUBLE, HPL_MIN, COMM); + break; + default: break; + } + + return (param); +} diff --git a/src/pauxil/HPL_pdlange_device.cpp b/src/pauxil/HPL_pdlange_device.cpp new file mode 100644 index 0000000..bf0fb1f --- /dev/null +++ b/src/pauxil/HPL_pdlange_device.cpp @@ -0,0 +1,302 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <hip/hip_runtime.h> + +#define BLOCK_SIZE 512 +#define GRID_SIZE 512 + +__global__ void normA_1(const int N, + const int M, + const double* __restrict__ A, + const int LDA, + double* __restrict__ normAtmp) { + __shared__ double s_norm[BLOCK_SIZE]; + + const int t = threadIdx.x; + const int i = blockIdx.x; + size_t id = i * BLOCK_SIZE + t; + + s_norm[t] = 0.0; + for(; id < (size_t)N * M; id += gridDim.x * BLOCK_SIZE) { + const int m = id % M; + const int n = id / M; + const double Anm = fabs(A[m + ((size_t)n) * LDA]); + + s_norm[t] = (Anm > s_norm[t]) ? Anm : s_norm[t]; + } + __syncthreads(); + + for(int k = BLOCK_SIZE / 2; k > 0; k /= 2) { + if(t < k) { + s_norm[t] = (s_norm[t + k] > s_norm[t]) ? s_norm[t + k] : s_norm[t]; + } + __syncthreads(); + } + + if(t == 0) normAtmp[i] = s_norm[0]; +} + +__global__ void normA_2(const int N, double* __restrict__ normAtmp) { + __shared__ double s_norm[BLOCK_SIZE]; + + const int t = threadIdx.x; + + s_norm[t] = 0.0; + for(size_t id = t; id < N; id += BLOCK_SIZE) { + const double Anm = normAtmp[id]; + s_norm[t] = (Anm > s_norm[t]) ? Anm : s_norm[t]; + } + __syncthreads(); + + for(int k = BLOCK_SIZE / 2; k > 0; k /= 2) { + if(t < k) { + s_norm[t] = (s_norm[t + k] > s_norm[t]) ?
s_norm[t + k] : s_norm[t]; + } + __syncthreads(); + } + + if(t == 0) normAtmp[0] = s_norm[0]; +} + +__global__ void norm1(const int N, + const int M, + const double* __restrict__ A, + const int LDA, + double* __restrict__ work) { + + __shared__ double s_norm1[BLOCK_SIZE]; + + const int t = threadIdx.x; + const int n = blockIdx.x; + + s_norm1[t] = 0.0; + for(size_t id = t; id < M; id += BLOCK_SIZE) { + s_norm1[t] += fabs(A[id + n * ((size_t)LDA)]); + } + + __syncthreads(); + + for(int k = BLOCK_SIZE / 2; k > 0; k /= 2) { + if(t < k) { s_norm1[t] += s_norm1[t + k]; } + __syncthreads(); + } + + if(t == 0) work[n] = s_norm1[0]; +} + +__global__ void norminf(const int N, + const int M, + const double* __restrict__ A, + const int LDA, + double* __restrict__ work) { + const int t = threadIdx.x; + const int b = blockIdx.x; + const size_t id = b * BLOCK_SIZE + t; // row id + + if(id < M) { + double norm = 0.0; + for(size_t i = 0; i < N; i++) { norm += fabs(A[id + i * ((size_t)LDA)]); } + work[id] = norm; + } +} + +double HPL_pdlange(const HPL_T_grid* GRID, + const HPL_T_NORM NORM, + const int M, + const int N, + const int NB, + const double* A, + const int LDA) { + /* + * Purpose + * ======= + * + * HPL_pdlange returns the value of the one norm, or the infinity norm, + * or the element of largest absolute value of a distributed matrix A: + * + * + * max(abs(A(i,j))) when NORM = HPL_NORM_A, + * norm1(A), when NORM = HPL_NORM_1, + * normI(A), when NORM = HPL_NORM_I, + * + * where norm1 denotes the one norm of a matrix (maximum column sum) and + * normI denotes the infinity norm of a matrix (maximum row sum). Note + * that max(abs(A(i,j))) is not a matrix norm. + * + * Arguments + * ========= + * + * GRID (local input) const HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * NORM (global input) const HPL_T_NORM + * On entry, NORM specifies the value to be returned by this + * function as described above. + * + * M (global input) const int + * On entry, M specifies the number of rows of the matrix A. + * M must be at least zero. + * + * N (global input) const int + * On entry, N specifies the number of columns of the matrix A. + * N must be at least zero. + * + * NB (global input) const int + * On entry, NB specifies the blocking factor used to partition + * and distribute the matrix. NB must be larger than one. + * + * A (local input) const double * + * On entry, A points to an array of dimension (LDA,LocQ(N)), + * that contains the local pieces of the distributed matrix A. + * + * LDA (local input) const int + * On entry, LDA specifies the leading dimension of the array A. + * LDA must be at least max(1,LocP(M)). 
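+ * + * For HPL_NORM_A the local reduction runs in two stages: normA_1 folds + * strided chunks of the local matrix into one partial maximum per + * block (at most GRID_SIZE values), and normA_2 then reduces those + * partials with a single block, leaving the local result in dwork[0] + * before the MPI reductions. A representative call (illustrative only) + * is + * + * double norm1A = HPL_pdlange(GRID, HPL_NORM_1, N, N, NB, A, LDA); + * + * which returns the same one-norm on every process of the grid.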
+ * + * --------------------------------------------------------------------- + */ + + double s, v0 = HPL_rzero, *work = NULL, *dwork = NULL; + MPI_Comm Acomm, Ccomm, Rcomm; + int ii, jj, mp, mycol, myrow, npcol, nprow, nq; + + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + Rcomm = GRID->row_comm; + Ccomm = GRID->col_comm; + Acomm = GRID->all_comm; + + Mnumroc(mp, M, NB, NB, myrow, 0, nprow); + Mnumroc(nq, N, NB, NB, mycol, 0, npcol); + + if(Mmin(M, N) == 0) { + return (v0); + } else if(NORM == HPL_NORM_A) { + /* + * max( abs( A ) ) + */ + if((nq > 0) && (mp > 0)) { + if(nq == 1) { // column vector + int id; + rocblas_idamax(handle, mp, A, 1, &id); + hipMemcpy(&v0, A + id - 1, 1 * sizeof(double), hipMemcpyDeviceToHost); + } else if(mp == 1) { // row vector + int id; + rocblas_idamax(handle, nq, A, LDA, &id); + hipMemcpy(&v0, + A + ((size_t)(id - 1) * LDA), + 1 * sizeof(double), + hipMemcpyDeviceToHost); + } else { + // custom reduction kernels + hipMalloc(&dwork, GRID_SIZE * sizeof(double)); + + size_t grid_size = ((size_t)nq * mp + BLOCK_SIZE - 1) / BLOCK_SIZE; + grid_size = (grid_size < GRID_SIZE) ? grid_size : GRID_SIZE; + + normA_1<<<grid_size, BLOCK_SIZE>>>(nq, mp, A, LDA, dwork); + normA_2<<<1, BLOCK_SIZE>>>(grid_size, dwork); + + hipMemcpy(&v0, dwork, 1 * sizeof(double), hipMemcpyDeviceToHost); + hipFree(dwork); + } + } + (void)HPL_reduce((void*)(&v0), 1, HPL_DOUBLE, HPL_MAX, 0, Acomm); + } else if(NORM == HPL_NORM_1) { + /* + * Find norm_1( A ). + */ + if(nq > 0) { + work = (double*)malloc((size_t)(nq) * sizeof(double)); + if(work == NULL) { + HPL_pabort(__LINE__, "HPL_pdlange", "Memory allocation failed"); + } + + if(nq == 1) { // column vector + rocblas_dasum(handle, mp, A, 1, work); + } else { + hipMalloc(&dwork, nq * sizeof(double)); + norm1<<<nq, BLOCK_SIZE>>>(nq, mp, A, LDA, dwork); + hipMemcpy(work, dwork, nq * sizeof(double), hipMemcpyDeviceToHost); + } + /* + * Find sum of global matrix columns, store on row 0 of process grid + */ + (void)HPL_reduce((void*)(work), nq, HPL_DOUBLE, HPL_SUM, 0, Ccomm); + /* + * Find maximum sum of columns for 1-norm + */ + if(myrow == 0) { + v0 = work[HPL_idamax(nq, work, 1)]; + v0 = Mabs(v0); + } + if(work) free(work); + if(dwork) hipFree(dwork); + } + /* + * Find max in row 0, store result in process (0,0) + */ + if(myrow == 0) + (void)HPL_reduce((void*)(&v0), 1, HPL_DOUBLE, HPL_MAX, 0, Rcomm); + } else if(NORM == HPL_NORM_I) { + /* + * Find norm_inf( A ) + */ + if(mp > 0) { + work = (double*)malloc((size_t)(mp) * sizeof(double)); + if(work == NULL) { + HPL_pabort(__LINE__, "HPL_pdlange", "Memory allocation failed"); + } + + if(mp == 1) { // row vector + rocblas_dasum(handle, nq, A, LDA, work); + } else { + hipMalloc(&dwork, mp * sizeof(double)); + + size_t grid_size = (mp + BLOCK_SIZE - 1) / BLOCK_SIZE; + norminf<<<grid_size, BLOCK_SIZE>>>(nq, mp, A, LDA, dwork); + hipMemcpy(work, dwork, mp * sizeof(double), hipMemcpyDeviceToHost); + } + + /* + * Find sum of global matrix rows, store on column 0 of process grid + */ + (void)HPL_reduce((void*)(work), mp, HPL_DOUBLE, HPL_SUM, 0, Rcomm); + /* + * Find maximum sum of rows for inf-norm + */ + if(mycol == 0) { + v0 = work[HPL_idamax(mp, work, 1)]; + v0 = Mabs(v0); + } + if(work) free(work); + if(dwork) hipFree(dwork); + } + /* + * Find max in column 0, store result in process (0,0) + */ + if(mycol == 0) + (void)HPL_reduce((void*)(&v0), 1, HPL_DOUBLE, HPL_MAX, 0, Ccomm); + } + /* + * Broadcast answer to every process in the grid + */ + (void)HPL_broadcast((void*)(&v0), 1, HPL_DOUBLE, 0, Acomm); + + return (v0); +} diff --git a/src/pauxil/HPL_pwarn.cpp
b/src/pauxil/HPL_pwarn.cpp new file mode 100644 index 0000000..e11b4bb --- /dev/null +++ b/src/pauxil/HPL_pwarn.cpp @@ -0,0 +1,89 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pwarn(FILE* STREAM, + int LINE, + const char* SRNAME, + const char* FORM, + ...) { + /* + * Purpose + * ======= + * + * HPL_pwarn displays an error message. + * + * + * Arguments + * ========= + * + * STREAM (local input) FILE * + * On entry, STREAM specifies the output stream. + * + * LINE (local input) int + * On entry, LINE specifies the line number in the file where + * the error has occurred. When LINE is not a positive line + * number, it is ignored. + * + * SRNAME (local input) const char * + * On entry, SRNAME should be the name of the routine calling + * this error handler. + * + * FORM (local input) const char * + * On entry, FORM specifies the format, i.e., how the subsequent + * arguments are converted for output. + * + * (local input) ... + * On entry, ... is the list of arguments to be printed within + * the format string. + * + * --------------------------------------------------------------------- + */ + + va_list argptr; + int rank; + char cline[128]; + + va_start(argptr, FORM); + (void)vsnprintf(cline, sizeof(cline), FORM, argptr); + va_end(argptr); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + /* + * Display an error message + */ + if(LINE <= 0) + HPL_fprintf(STREAM, + "%s %s %d, %s %s:\n>>> %s <<<\n\n", + "HPL ERROR", + "from process #", + rank, + "in function", + SRNAME, + cline); + else + HPL_fprintf(STREAM, + "%s %s %d, %s %d %s %s:\n>>> %s <<<\n\n", + "HPL ERROR", + "from process #", + rank, + "on line", + LINE, + "of function", + SRNAME, + cline); +}
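A typical call site for this handler passes __LINE__ and the calling routine's name; the snippet below is a hypothetical example, not taken from the patch.

    /* Hypothetical call site (rank-local warning about a bad parameter): */
    HPL_pwarn(stderr, __LINE__, "HPL_pdinfo", "Illegal value of N: %d", n);

diff --git a/src/pfact/HPL_dlocmax.cpp b/src/pfact/HPL_dlocmax.cpp new file mode 100644 index 0000000..de12400 --- /dev/null +++ b/src/pfact/HPL_dlocmax.cpp @@ -0,0 +1,110 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlocmax(HPL_T_panel* PANEL, + const int N, + const int II, + const int JJ, + double* WORK, + int thread_rank, + int thread_size, + int* max_index, + double* max_value) { + /* + * Purpose + * ======= + * + * HPL_dlocmax finds the maximum entry in the current column and packs + * the useful information in WORK[0:3]. On exit, WORK[0] contains the + * local maximum absolute value scalar, WORK[1] is the corresponding + * local row index, WORK[2] is the corresponding global row index, and + * WORK[3] is the coordinate of the process owning this max.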
When N is + * less than 1, WORK[0:2] is initialized to zero, and WORK[3] is set + * to the total number of process rows. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * N (local input) const int + * On entry, N specifies the local number of rows of the column + * of A on which we operate. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 4. On exit, + * WORK[0] contains the local maximum absolute value scalar, + * WORK[1] contains the corresponding local row index, WORK[2] + * contains the corresponding global row index, and WORK[3] is + * the coordinate of the process owning this max. + * + * --------------------------------------------------------------------- + */ + + double* A; + int kk, igindx, ilindx, myrow, nb, nprow; + + if(N > 0) { + A = Mptr(PANEL->A, II, JJ, PANEL->lda); + myrow = PANEL->grid->myrow; + nprow = PANEL->grid->nprow; + nb = PANEL->nb; + + HPL_idamax_omp( + N, A, 1, nb, II, thread_rank, thread_size, max_index, max_value); + + if(thread_rank == 0) { + ilindx = max_index[0]; + kk = PANEL->ii + II + (ilindx); + Mindxl2g(igindx, kk, nb, nb, myrow, 0, nprow); + /* + * WORK[0] := local maximum absolute value scalar, + * WORK[1] := corresponding local row index, + * WORK[2] := corresponding global row index, + * WORK[3] := coordinate of process owning this max. + */ + WORK[0] = max_value[0]; + WORK[1] = (double)(ilindx); + WORK[2] = (double)(igindx); + WORK[3] = (double)(myrow); + } + } else { + /* + * If I do not have any row of A, then set the coordinate of the process + * (WORK[3]) owning this "ghost" row, such that it will never be used, + * even if there are only zeros in the current column of A. + */ + if(thread_rank == 0) { + WORK[0] = WORK[1] = WORK[2] = HPL_rzero; + WORK[3] = (double)(PANEL->grid->nprow); + } + } + +// make sure WORK is visible to all threads +#pragma omp barrier +}
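The WORK[0:3] header written above is a plain-double encoding of (pivot value, local row, global row, owning process row). The sketch below shows the same packing in isolation, with the zero-offset block-cyclic local-to-global mapping of Mindxl2g written out explicitly; pack_local_pivot and its arguments are hypothetical names, not rocHPL code.

    #include <cmath>

    // Hypothetical sketch of HPL_dlocmax's packing convention: WORK[0..3] =
    // { signed value of the max-abs entry, local row offset, global row,
    //   owning process row }.
    static void pack_local_pivot(const double* x, int n, int il_off, int nb,
                                 int myrow, int nprow, double WORK[4]) {
      int imax = 0;
      for(int i = 1; i < n; i++)
        if(std::fabs(x[i]) > std::fabs(x[imax])) imax = i;
      const int il = il_off + imax; // local row index within the panel
      // zero-offset block-cyclic local-to-global mapping (as in Mindxl2g):
      const int ig = (il / nb) * nprow * nb + myrow * nb + (il % nb);
      WORK[0] = x[imax];
      WORK[1] = (double)imax;
      WORK[2] = (double)ig;
      WORK[3] = (double)myrow;
    }

diff --git a/src/pfact/HPL_dlocswpN.cpp b/src/pfact/HPL_dlocswpN.cpp new file mode 100644 index 0000000..51a0f8a --- /dev/null +++ b/src/pfact/HPL_dlocswpN.cpp @@ -0,0 +1,150 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlocswpN(HPL_T_panel* PANEL, + const int II, + const int JJ, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_dlocswpN performs the local swapping operations within a panel. + * The lower triangular N0-by-N0 upper block of the panel is stored in + * no-transpose form (i.e. just like the input matrix itself).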
+ * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2 * (4+2*N0). + * WORK[0] contains the local maximum absolute value scalar, + * WORK[1] contains the corresponding local row index, WORK[2] + * contains the corresponding global row index, and WORK[3] is + * the coordinate of the process owning this max. The N0 length max + * row is stored in WORK[4:4+N0-1]. Note that this is also the + * JJth row (or column) of L1. The remaining part of this array + * is used as workspace. + * + * --------------------------------------------------------------------- + */ + + double gmax; + double *A1, *A2, *L, *Wr0, *Wmx; + int ilindx, lda, myrow, n0; + + myrow = PANEL->grid->myrow; + n0 = PANEL->jb; + int NB = PANEL->nb; + lda = PANEL->lda; + + Wr0 = (Wmx = WORK + 4) + NB; + Wmx[JJ] = gmax = WORK[0]; + + /* + * Replicated swap and copy of the current (new) row of A into L1 + */ + L = Mptr(PANEL->L1, JJ, 0, n0); + /* + * If the pivot is non-zero ... + */ + if(gmax != HPL_rzero) { + /* + * and if I own the current row of A ... + */ + if(myrow == PANEL->prow) { + /* + * and if I also own the row to be swapped with the current row of A ... + */ + if(myrow == (int)(WORK[3])) { + /* + * and if the current row of A is not to be swapped with itself ... + */ + if((ilindx = (int)(WORK[1])) != 0) { + /* + * then copy the max row into L1 and locally swap the 2 rows of A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + A2 = Mptr(A1, ilindx, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, n0); + HPL_dcopy(n0, Wmx, 1, A1, lda); + HPL_dcopy(n0, Wr0, 1, A2, lda); + + } else { + /* + * otherwise the current row of A is swapped with itself, so just + * copy the current row of A into L1. + */ + *Mptr(PANEL->A, II, JJ, lda) = gmax; + + HPL_dcopy(n0, Wmx, 1, L, n0); + } + + } else { + /* + * otherwise, the row to be swapped with the current row of A is in Wmx, + * so copy Wmx into L1 and A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, n0); + HPL_dcopy(n0, Wmx, 1, A1, lda); + } + + } else { + /* + * otherwise I do not own the current row of A, so copy the max row Wmx + * into L1. + */ + HPL_dcopy(n0, Wmx, 1, L, n0); + + /* + * and if I own the max row, overwrite it with the current row Wr0. + */ + if(myrow == (int)(WORK[3])) { + A2 = Mptr(PANEL->A, II + (size_t)(WORK[1]), 0, lda); + + HPL_dcopy(n0, Wr0, 1, A2, lda); + } + } + } else { + /* + * Otherwise the max element in the current column is zero, simply copy + * the current row Wr0 into L1. The matrix is singular. + */ + HPL_dcopy(n0, Wr0, 1, L, n0); + + /* + * set INFO. + */ + if(*(PANEL->DINFO) == 0.0) *(PANEL->DINFO) = (double)(PANEL->ia + JJ + 1); + } +}
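The nested branches above reduce to a small decision table: every process row copies the winning row into L1, while A itself is written only where the current row and/or the pivot row actually live. A hedged, condensed restatement (hypothetical helper, pivot assumed non-zero):

    // Hypothetical decision table for the non-zero-pivot case in
    // HPL_dlocswpN/T: which copies a given process row performs.
    enum Action { WMX_TO_L1 = 1, WMX_TO_A_CURRENT = 2, WR0_TO_A_PIVOT = 4 };

    int swap_actions(bool own_current, bool own_pivot, bool self_swap) {
      int a = WMX_TO_L1;                      // replicated on every process row
      if(own_current) a |= WMX_TO_A_CURRENT;  // pivot row replaces current row
      if(own_pivot && !(own_current && self_swap))
        a |= WR0_TO_A_PIVOT;                  // old current row fills pivot slot
      return a;
    }

diff --git a/src/pfact/HPL_dlocswpT.cpp b/src/pfact/HPL_dlocswpT.cpp new file mode 100644 index 0000000..eca4e4f --- /dev/null +++ b/src/pfact/HPL_dlocswpT.cpp @@ -0,0 +1,150 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P.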
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_dlocswpT(HPL_T_panel* PANEL, + const int II, + const int JJ, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_dlocswpT performs the local swapping operations within a panel. + * The lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2 * (4+2*N0). + * WORK[0] contains the local maximum absolute value scalar, + * WORK[1] contains the corresponding local row index, WORK[2] + * contains the corresponding global row index, and WORK[3] is + * the coordinate of the process owning this max. The N0 length max + * row is stored in WORK[4:4+N0-1]. Note that this is also the + * JJth row (or column) of L1. The remaining part of this array + * is used as workspace. + * + * --------------------------------------------------------------------- + */ + + double gmax; + double *A1, *A2, *L, *Wr0, *Wmx; + int ilindx, lda, myrow, n0; + + myrow = PANEL->grid->myrow; + n0 = PANEL->jb; + int NB = PANEL->nb; + lda = PANEL->lda; + + Wr0 = (Wmx = WORK + 4) + NB; + Wmx[JJ] = gmax = WORK[0]; + + /* + * Replicated swap and copy of the current (new) row of A into L1 + */ + L = Mptr(PANEL->L1, 0, JJ, n0); + /* + * If the pivot is non-zero ... + */ + if(gmax != HPL_rzero) { + /* + * and if I own the current row of A ... + */ + if(myrow == PANEL->prow) { + /* + * and if I also own the row to be swapped with the current row of A ... + */ + if(myrow == (int)(WORK[3])) { + /* + * and if the current row of A is not to be swapped with itself ... + */ + if((ilindx = (int)(WORK[1])) != 0) { + /* + * then copy the max row into L1 and locally swap the 2 rows of A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + A2 = Mptr(A1, ilindx, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, 1); + HPL_dcopy(n0, Wmx, 1, A1, lda); + HPL_dcopy(n0, Wr0, 1, A2, lda); + + } else { + /* + * otherwise the current row of A is swapped with itself, so just + * copy the current row of A into L1. + */ + *Mptr(PANEL->A, II, JJ, lda) = gmax; + + HPL_dcopy(n0, Wmx, 1, L, 1); + } + + } else { + /* + * otherwise, the row to be swapped with the current row of A is in Wmx, + * so copy Wmx into L1 and A. + */ + A1 = Mptr(PANEL->A, II, 0, lda); + + HPL_dcopy(n0, Wmx, 1, L, 1); + HPL_dcopy(n0, Wmx, 1, A1, lda); + } + + } else { + /* + * otherwise I do not own the current row of A, so copy the max row Wmx + * into L1. + */ + HPL_dcopy(n0, Wmx, 1, L, 1); + + /* + * and if I own the max row, overwrite it with the current row Wr0.
+ */ + if(myrow == (int)(WORK[3])) { + A2 = Mptr(PANEL->A, II + (size_t)(WORK[1]), 0, lda); + + HPL_dcopy(n0, Wr0, 1, A2, lda); + } + } + } else { + /* + * Otherwise the max element in the current column is zero, simply copy + * the current row Wr0 into L1. The matrix is singular. + */ + HPL_dcopy(n0, Wr0, 1, L, 1); + + /* + * set INFO. + */ + if(*(PANEL->DINFO) == 0.0) *(PANEL->DINFO) = (double)(PANEL->ia + JJ + 1); + } +}
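HPL_dlocswpN and its transpose-form twin HPL_dlocswpT differ essentially only in the increment used when storing the pivot row into the column-major n0-by-n0 L1 block, as the HPL_dcopy calls above show. A small sketch of just that difference (store_pivot_row is a hypothetical helper):

    #include <vector>

    // Where element j of the pivot row lands in the column-major L1 block:
    // no-transpose: L1[JJ + j*n0] (stride n0, a matrix row);
    // transpose:    L1[j + JJ*n0] (stride 1, a matrix column).
    void store_pivot_row(std::vector<double>& L1, const double* Wmx,
                         int n0, int JJ, bool transposed) {
      const int stride = transposed ? 1 : n0;
      double* L = transposed ? &L1[JJ * n0] : &L1[JJ];
      for(int j = 0; j < n0; j++) L[j * stride] = Wmx[j];
    }

diff --git a/src/pfact/HPL_pdfact.cpp b/src/pfact/HPL_pdfact.cpp new file mode 100644 index 0000000..2e8f21a --- /dev/null +++ b/src/pfact/HPL_pdfact.cpp @@ -0,0 +1,109 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" +#include <cassert> + +void HPL_pdfact(HPL_T_panel* PANEL) { + /* + * Purpose + * ======= + * + * HPL_pdfact recursively factorizes a 1-dimensional panel of columns. + * The RPFACT function pointer specifies the recursive algorithm to be + * used, either Crout, Left- or Right looking. NBMIN allows to vary the + * recursive stopping criterion in terms of the number of columns in the + * panel, and NDIV allows to specify the number of subpanels each panel + * should be divided into. Usually a value of 2 will be chosen. Finally + * PFACT is a function pointer specifying the non-recursive algorithm + * to be used on at most NBMIN columns. One can also choose here between + * Crout, Left- or Right looking. Empirical tests seem to indicate that + * values of 4 or 8 for NBMIN give the best results. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information.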
+ * + * --------------------------------------------------------------------- + */ + + int jb, i; + + jb = PANEL->jb; + PANEL->n -= jb; + PANEL->ja += jb; + + if((PANEL->grid->mycol != PANEL->pcol) || (jb <= 0)) return; +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_RPFACT); +#endif + /* + * Factor the panel - Update the panel pointers + */ + double max_value[128]; + int max_index[128]; + + roctxRangePush("pdfact"); + +#pragma omp parallel shared(max_value, max_index) + { + const int thread_rank = omp_get_thread_num(); + const int thread_size = omp_get_num_threads(); + assert(thread_size <= 128); + + PANEL->algo->rffun(PANEL, + PANEL->mp, + jb, + 0, + PANEL->fWORK, + thread_rank, + thread_size, + max_value, + max_index); + } + + roctxRangePop(); + + // PANEL->A = Mptr( PANEL->A, 0, jb, PANEL->lda ); + PANEL->dA = Mptr(PANEL->dA, 0, jb, PANEL->dlda); + PANEL->nq -= jb; + PANEL->jj += jb; +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_RPFACT); +#endif +} diff --git a/src/pfact/HPL_pdmxswp.cpp b/src/pfact/HPL_pdmxswp.cpp new file mode 100644 index 0000000..dbcd366 --- /dev/null +++ b/src/pfact/HPL_pdmxswp.cpp @@ -0,0 +1,132 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdmxswp(HPL_T_panel* PANEL, + const int M, + const int II, + const int JJ, + double* WORK) { + /* + * Purpose + * ======= + * + * HPL_pdmxswp swaps and broadcasts the absolute value max row using + * bi-directional exchange. The buffer is partially set by HPL_dlocmax. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by + * + * log_2( P ) * ( lat + ( 2 * N0 + 4 ) / bdwth ) + * + * where lat and bdwth are the latency and bandwidth of the network for + * double precision real elements. Communication only occurs in one + * process column. Mono-directional links will cause the communication + * cost to double. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of the matrix + * column on which this function operates. + * + * II (local input) const int + * On entry, II specifies the row offset where the column to be + * operated on starts with respect to the panel. + * + * JJ (local input) const int + * On entry, JJ specifies the column offset where the column to + * be operated on starts with respect to the panel. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2 * (4+2*N0). + * It is assumed that HPL_dlocmax was called prior to this + * routine to initialize the first four entries of this array. 
+ * On exit, the N0 length max row is stored in WORK[4:4+N0-1]; + * Note that this is also the JJth row (or column) of L1. The + * remaining part is used as a temporary array. + * + * --------------------------------------------------------------------- + */ + + double * A0, *Wmx, *Wwork; + HPL_T_grid* grid; + MPI_Comm comm; + int cnt_, cnt0, i, icurrow, lda, myrow, n0; + +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_MXSWP); +#endif + grid = PANEL->grid; + comm = grid->col_comm; + myrow = grid->myrow; + n0 = PANEL->jb; + int NB = PANEL->nb; + icurrow = PANEL->prow; + /* + * Set up pointers in workspace: WORK and Wwork point to the beginning + * of the buffers of size 4 + 2*N0 to be combined. Wmx points to the row + * owning the local (before combine) and global (after combine) absolute + * value max. A0 points to the copy of the current row of the matrix. + */ + cnt0 = 4 + 2 * NB; + + A0 = (Wmx = WORK + 4) + NB; + Wwork = WORK + cnt0; + + /* + * Wmx[0:N0-1] := A[ilindx,0:N0-1] where ilindx is (int)(WORK[1]) (row + * with max in current column). If I am the current process row, pack in + * addition the current row of A in A0[0:N0-1]. If I do not own any row + * of A, then zero out Wmx[0:N0-1]. + */ + if(M > 0) { + lda = PANEL->lda; + + HPL_dcopy(n0, Mptr(PANEL->A, II + (int)(WORK[1]), 0, lda), lda, Wmx, 1); + if(myrow == icurrow) { + HPL_dcopy(n0, Mptr(PANEL->A, II, 0, lda), lda, A0, 1); + } else { + for(i = 0; i < n0; i++) A0[i] = HPL_rzero; + } + } else { + for(i = 0; i < n0; i++) A0[i] = HPL_rzero; + for(i = 0; i < n0; i++) Wmx[i] = HPL_rzero; + } + + /* Perform swap-broadcast */ + HPL_all_reduce_dmxswp(WORK, cnt0, icurrow, comm, Wwork); + + /* + * Save the global pivot index in pivot array + */ + (PANEL->ipiv)[JJ] = (int)WORK[2]; +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_MXSWP); +#endif +} diff --git a/src/pfact/HPL_pdpancrN.cpp b/src/pfact/HPL_pdpancrN.cpp new file mode 100644 index 0000000..003e266 --- /dev/null +++ b/src/pfact/HPL_pdpancrN.cpp @@ -0,0 +1,233 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpancrN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpancrN factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Crout variant of the usual + * one-dimensional algorithm. The lower triangular N0-by-N0 upper block + * of the panel is stored in no-transpose form (i.e. just like the input + * matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. 
On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk = 0, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + /* + * Compute row (column) jj of L1 + */ + if(kk > 0) { + L1ptr = Mptr(L1, jj, jj + 1, n0); + + if(thread_rank == 0) { + HPL_dgemv(HplColumnMajor, + HplTrans, + kk, + Nm1, + -HPL_rone, + Mptr(L1, ICOFF, jj + 1, n0), + n0, + Mptr(L1, jj, ICOFF, n0), + n0, + HPL_rone, + L1ptr, + n0); + + if(curr != 0) HPL_dcopy(Nm1, L1ptr, n0, Mptr(A, ii, jj + 1, lda), lda); + } + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update dia- + * diagonal and subdiagonal elements in column A(iip1:iip1+Mm1-1, jj+1) + * and find local absolute value max in that column (Only one pass + * through cache for each current column). This sequence of operations + * could benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk + 1, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + Mptr(L1, ICOFF, jj + 1, n0), + 1, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + kk++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdpancrT.cpp b/src/pfact/HPL_pdpancrT.cpp new file mode 100644 index 0000000..84de5f5 --- /dev/null +++ b/src/pfact/HPL_pdpancrT.cpp @@ -0,0 +1,232 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpancrT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpancrT factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Crout variant of the usual + * one-dimensional algorithm. The lower triangular N0-by-N0 upper block + * of the panel is stored in transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk = 0, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + /* + * Compute row (column) jj of L1 + */ + if(kk > 0) { + L1ptr = Mptr(L1, jj + 1, jj, n0); + + if(thread_rank == 0) { + HPL_dgemv(HplColumnMajor, + HplNoTrans, + Nm1, + kk, + -HPL_rone, + Mptr(L1, jj + 1, ICOFF, n0), + n0, + Mptr(L1, ICOFF, jj, n0), + 1, + HPL_rone, + L1ptr, + 1); + + if(curr != 0) HPL_dcopy(Nm1, L1ptr, 1, Mptr(A, ii, jj + 1, lda), lda); + } + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update dia- + * diagonal and subdiagonal elements in column A(iip1:iip1+Mm1-1, jj+1) + * and find local absolute value max in that column (Only one pass + * through cache for each current column). This sequence of operations + * could benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk + 1, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + Mptr(L1, jj + 1, ICOFF, n0), + n0, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + kk++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdpanllN.cpp b/src/pfact/HPL_pdpanllN.cpp new file mode 100644 index 0000000..5f27ea2 --- /dev/null +++ b/src/pfact/HPL_pdpanllN.cpp @@ -0,0 +1,224 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanllN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanllN factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Left-looking variant of the + * usual one-dimensional algorithm. The lower triangular N0-by-N0 upper + * block of the panel is stored in no-transpose form (i.e. just like the + * input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column and initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + + L1ptr = Mptr(L1, ICOFF, jj + 1, n0); + kk = jj + 1 - ICOFF; + if(thread_rank == 0) { + HPL_dtrsv(HplColumnMajor, + HplLower, + HplNoTrans, + HplUnit, + kk, + Mptr(L1, ICOFF, ICOFF, n0), + n0, + L1ptr, + 1); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update and + * find local absolute value max in next column (Only one pass through + * cache for each next column). This sequence of operations could + * benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + L1ptr, + 1, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + if(thread_rank == 0) { + HPL_dcopy(kk, L1ptr, 1, Mptr(A, ICOFF, jj + 1, lda), 1); + } + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +}
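For orientation, the left-looking ordering used by HPL_pdpanllN above and by HPL_pdpanllT below defers all updates to a column until that column becomes current, then scales its sub-diagonal. A minimal unblocked scalar analogue (unpivoted and illustrative only, not the panel code itself):

    // Minimal unblocked, unpivoted left-looking LU sketch: pending updates
    // are applied to column j only when it becomes current, then the
    // sub-diagonal is scaled, the ordering HPL_pdpanllN realizes panel-wide
    // with HPL_dtrsv / HPL_dgemv_omp / HPL_dscal_omp.
    void left_looking_lu(double* A, int n, int lda) {
      for(int j = 0; j < n; j++) {
        for(int k = 0; k < j; k++)      // deferred updates from columns 0..j-1
          for(int i = k + 1; i < n; i++)
            A[i + j * lda] -= A[i + k * lda] * A[k + j * lda];
        for(int i = j + 1; i < n; i++)  // scale by the diagonal pivot
          A[i + j * lda] /= A[j + j * lda];
      }
    }

diff --git a/src/pfact/HPL_pdpanllT.cpp b/src/pfact/HPL_pdpanllT.cpp new file mode 100644 index 0000000..c11d204 --- /dev/null +++ b/src/pfact/HPL_pdpanllT.cpp @@ -0,0 +1,223 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanllT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanllT factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Left-looking variant of the + * usual one-dimensional algorithm.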
The lower triangular N0-by-N0 upper + * block of the panel is stored in transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *L1, *L1ptr; + int Mm1, Nm1, curr, ii, iip1, jj, kk, lda, m = M, n0; +/* .. + * .. Executable Statements ..
+ */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column and initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 > 0) { + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + + L1ptr = Mptr(L1, jj + 1, ICOFF, n0); + kk = jj + 1 - ICOFF; + if(thread_rank == 0) { + HPL_dtrsv(HplColumnMajor, + HplUpper, + HplTrans, + HplUnit, + kk, + Mptr(L1, ICOFF, ICOFF, n0), + n0, + L1ptr, + n0); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update and + * find local absolute value max in next column (Only one pass through + * cache for each next column). This sequence of operations could + * benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dgemv_omp(HplColumnMajor, + HplNoTrans, + Mm1, + kk, + -HPL_rone, + Mptr(A, iip1, ICOFF, lda), + lda, + L1ptr, + n0, + HPL_rone, + Mptr(A, iip1, jj + 1, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + if(curr != 0) { + if(thread_rank == 0) { + HPL_dcopy(kk, L1ptr, n0, Mptr(A, ICOFF, jj + 1, lda), 1); + } + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +}
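The right-looking variants below invert that ordering: the pivot column is scaled and the rank-1 update is applied to the trailing submatrix immediately, which is what HPL_pdpanrlN/T do with HPL_dscal_omp, HPL_daxpy_omp and HPL_dger_omp. A scalar analogue of one elimination step (unpivoted, illustrative only):

    // One unpivoted right-looking elimination step: scale the pivot column,
    // then rank-1 update the trailing submatrix (A22 -= l21 * u12^T).
    void right_looking_step(double* A, int n, int lda, int j) {
      for(int i = j + 1; i < n; i++) A[i + j * lda] /= A[j + j * lda];
      for(int k = j + 1; k < n; k++)
        for(int i = j + 1; i < n; i++)
          A[i + k * lda] -= A[i + j * lda] * A[j + k * lda];
    }

diff --git a/src/pfact/HPL_pdpanrlN.cpp b/src/pfact/HPL_pdpanrlN.cpp new file mode 100644 index 0000000..c341e1d --- /dev/null +++ b/src/pfact/HPL_pdpanrlN.cpp @@ -0,0 +1,228 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanrlN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanrlN factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Right-looking variant of the + * usual one-dimensional algorithm.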
The lower triangular N0-by-N0 upper + * block of the panel is stored in no-transpose form (i.e. just like the + * input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Acur, *Anxt; + int Mm1, Nm1, curr, ii, iip1, jj, lda, m = M; +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 >= 1) { + Acur = Mptr(A, iip1, jj, lda); + Anxt = Mptr(Acur, 0, 1, lda); + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update + * trailing sub-matrix and find local absolute value max in next column + * (Only one pass through cache for each current column).
This sequence of + * operations could benefit from a specialized blocked implementation. + */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Acur, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_daxpy_omp(Mm1, + -WORK[4 + jj + 1], + Acur, + 1, + Anxt, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + + if(Nm1 > 1) + HPL_dger_omp(HplColumnMajor, + Mm1, + Nm1 - 1, + -HPL_rone, + Acur, + 1, + WORK + 4 + jj + 2, + 1, + Mptr(Anxt, 0, 1, lda), + lda, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#pragma omp barrier + + /* + * Same thing as above but with worse data access on y (A += x * y^T) + * + * if( Nm1 > 1 ) ) + * HPL_dger( HplColumnMajor, Mm1, Nm1-1, -HPL_rone, Acur, 1, + * Mptr( L1, jj, jj+2, n0 ), n0, Mptr( Anxt, 0, 1, lda ), + * lda ); + */ + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpN(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdpanrlT.cpp b/src/pfact/HPL_pdpanrlT.cpp new file mode 100644 index 0000000..3052223 --- /dev/null +++ b/src/pfact/HPL_pdpanrlT.cpp @@ -0,0 +1,224 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdpanrlT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdpanrlT factorizes a panel of columns that is a sub-array of a + * larger one-dimensional panel A using the Right-looking variant of the + * usual one-dimensional algorithm. The lower triangular N0-by-N0 upper + * block of the panel is stored in transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. 
On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Note that one iteration of the main loop is unrolled. The local + * computation of the absolute value max of the next column is performed + * just after its update by the current column. This allows to bring the + * current column only once through cache at each step. The current + * implementation does not perform any blocking for this sequence of + * BLAS operations, however the design allows for plugging in an optimal + * (machine-specific) specialized BLAS-like kernel. This idea has been + * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Acur, *Anxt, *L1; + int Mm1, Nm1, curr, ii, iip1, jj, lda, m = M, n0; +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + Nm1 = N - 1; + jj = ICOFF; + if(curr != 0) { + ii = ICOFF; + iip1 = ii + 1; + Mm1 = m - 1; + } else { + ii = 0; + iip1 = ii; + Mm1 = m; + } + + /* + * Find local absolute value max in first column - initialize WORK[0:3] + */ + HPL_dlocmax( + PANEL, m, ii, jj, WORK, thread_rank, thread_size, max_index, max_value); + + while(Nm1 >= 1) { + Acur = Mptr(A, iip1, jj, lda); + Anxt = Mptr(Acur, 0, 1, lda); + /* + * Swap and broadcast the current row + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + /* + * Scale current column by its absolute value max entry - Update + * trailing sub-matrix and find local absolute value max in next column + * (Only one pass through cache for each current column). This sequence of + * operations could benefit from a specialized blocked implementation.
+ */ + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Acur, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_daxpy_omp(Mm1, + -(*(Mptr(L1, jj + 1, jj, n0))), + Acur, + 1, + Anxt, + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + + HPL_dlocmax(PANEL, + Mm1, + iip1, + jj + 1, + WORK, + thread_rank, + thread_size, + max_index, + max_value); + + if(Nm1 > 1) { + + HPL_dger_omp(HplColumnMajor, + Mm1, + Nm1 - 1, + -HPL_rone, + Acur, + 1, + Mptr(L1, jj + 2, jj, n0), + 1, + Mptr(Anxt, 0, 1, lda), + lda, + PANEL->nb, + iip1, + thread_rank, + thread_size); + } + +#pragma omp barrier + + if(curr != 0) { + ii = iip1; + iip1++; + m = Mm1; + Mm1--; + } + + Nm1--; + jj++; + } + /* + * Swap and broadcast last row - Scale last column by its absolute value + * max entry + */ + if(thread_rank == 0) { + HPL_pdmxswp(PANEL, m, ii, jj, WORK); + HPL_dlocswpT(PANEL, ii, jj, WORK); + } + +#pragma omp barrier + + if(WORK[0] != HPL_rzero) + HPL_dscal_omp(Mm1, + HPL_rone / WORK[0], + Mptr(A, iip1, jj, lda), + 1, + PANEL->nb, + iip1, + thread_rank, + thread_size); + +#ifdef HPL_DETAILED_TIMING + if(thread_rank == 0) HPL_ptimer(HPL_TIMING_PFACT); +#endif +} diff --git a/src/pfact/HPL_pdrpancrN.cpp b/src/pfact/HPL_pdrpancrN.cpp new file mode 100644 index 0000000..b5389a9 --- /dev/null +++ b/src/pfact/HPL_pdrpancrN.cpp @@ -0,0 +1,214 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpancrN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpancrN recursively factorizes a panel of columns + * using the recursive Crout variant of the usual one-dimensional + * algorithm. The lower triangular N0-by-N0 upper block of the panel is + * stored in no-transpose form (i.e. just like the input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost.
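+ * + * (Editor's note, added for orientation: each pass of the main loop + * below handles one block of jb columns. In this Crout ordering the + * block is first brought up to date with a GEMM against the columns + * factored so far, then factored recursively, and only then is the + * replicated L1 copy of the trailing columns updated by a GEMM + * followed by a triangular solve.)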
+ * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + +#pragma omp barrier + + /* + * Local update - Factor current panel - Replicated update and solve + */ + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplNoTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, 0, jj, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? 
ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpancrN(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(n > 0) { + + if(thread_rank == 0) { + HPL_dgemm(HplColumnMajor, + HplNoTrans, + HplNoTrans, + jb, + n, + jj, + -HPL_rone, + Mptr(L1ptr, jj, 0, n0), + n0, + Mptr(L1ptr, 0, jj + jb, n0), + n0, + HPL_rone, + Mptr(L1ptr, jj, jj + jb, n0), + n0); + + HPL_dtrsm(HplColumnMajor, + HplLeft, + HplLower, + HplNoTrans, + HplUnit, + jb, + n, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj, jj + jb, n0), + n0); + } + } + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlacpy( + ioff, jb, Mptr(L1, 0, ioff, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpancrT.cpp b/src/pfact/HPL_pdrpancrT.cpp new file mode 100644 index 0000000..7694cb5 --- /dev/null +++ b/src/pfact/HPL_pdrpancrT.cpp @@ -0,0 +1,213 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpancrT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpancrT recursively factorizes a panel of columns using the + * recursive Crout variant of the usual one-dimensional algorithm. + * The lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). 
+ * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + +#pragma omp barrier + + /* + * Local update - Factor current panel - Replicated update and solve + */ + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, jj, 0, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpancrT(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(n > 0) { + if(thread_rank == 0) { + HPL_dgemm(HplColumnMajor, + HplNoTrans, + HplNoTrans, + n, + jb, + jj, + -HPL_rone, + Mptr(L1ptr, jj + jb, 0, n0), + n0, + Mptr(L1ptr, 0, jj, n0), + n0, + HPL_rone, + Mptr(L1ptr, jj + jb, jj, n0), + n0); + + HPL_dtrsm(HplColumnMajor, + HplRight, + HplUpper, + HplNoTrans, + HplUnit, + n, + jb, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj + jb, jj, n0), + n0); + } + } + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlatcpy( + ioff, jb, Mptr(L1, ioff, 0, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanllN.cpp b/src/pfact/HPL_pdrpanllN.cpp new file mode 100644 index 0000000..4ed3a61 --- /dev/null +++ b/src/pfact/HPL_pdrpanllN.cpp @@ -0,0 +1,193 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanllN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanllN recursively factorizes a panel of columns using the + * recursive Left-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * no-transpose form (i.e. just like the input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. 
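+ * + * (Editor's example with hypothetical values: for N = 100, NBMIN = 16 + * and NDIV = 2, nblocks = ( 100 + 15 ) / 16 = 7 in integer division, + * hence jb = ( ( 7 + 1 ) / 2 ) * 16 = 64, which indeed satisfies + * 16 <= 64 < 100.)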
+ */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Replicated solve - Local update - Factor current panel + */ + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplLeft, + HplLower, + HplNoTrans, + HplUnit, + jj, + jb, + HPL_rone, + L1ptr, + n0, + Mptr(L1ptr, 0, jj, n0), + n0); + } + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplNoTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, 0, jj, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpanllN(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlacpy( + ioff, jb, Mptr(L1, 0, ioff, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanllT.cpp b/src/pfact/HPL_pdrpanllT.cpp new file mode 100644 index 0000000..df92e4a --- /dev/null +++ b/src/pfact/HPL_pdrpanllT.cpp @@ -0,0 +1,193 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanllT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanllT recursively factorizes a panel of columns using the + * recursive Left-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. 
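+ * (Editor's rough count with a hypothetical P = 8: a swap followed by + * a separate broadcast would cost on the order of + * N0 * ( 1 + log_2(8) ) message latencies per panel, whereas the + * combined exchange costs about N0 * log_2(8), i.e. roughly N0 fewer + * latencies.)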
On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. 
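+ * + * (Editor's note: replicating the jj-by-jb triangular solve on L1 in + * every process row trades a small amount of redundant Level 3 work + * for the extra broadcast of the solved block that a non-replicated + * scheme would require at each step.)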
+ */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Replicated solve - Local update - Factor current panel + */ + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplRight, + HplUpper, + HplNoTrans, + HplUnit, + jb, + jj, + HPL_rone, + L1ptr, + n0, + Mptr(L1ptr, jj, 0, n0), + n0); + } + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplTrans, + m, + jb, + jj, + -HPL_rone, + Mptr(Aptr, ii, 0, lda), + lda, + Mptr(L1ptr, jj, 0, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + HPL_pdrpanllT(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlatcpy( + ioff, jb, Mptr(L1, ioff, 0, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + ii += jb; + m -= jb; + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanrlN.cpp b/src/pfact/HPL_pdrpanrlN.cpp new file mode 100644 index 0000000..33524b7 --- /dev/null +++ b/src/pfact/HPL_pdrpanrlN.cpp @@ -0,0 +1,198 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanrlN(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanrlN recursively factorizes a panel of columns using the + * recursive Right-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * no-transpose form (i.e. just like the input matrix itself). + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). 
+ * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. + */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Factor current panel - Replicated solve - Local update + */ + HPL_pdrpanrlN(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplLeft, + HplLower, + HplNoTrans, + HplUnit, + jb, + n, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj, jj + jb, n0), + n0); + } + if(curr != 0) { + ii += jb; + m -= jb; + } + +#pragma omp barrier + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplNoTrans, + m, + n, + jb, + -HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + Mptr(L1ptr, jj, jj + jb, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj + jb, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlacpy( + ioff, jb, Mptr(L1, 0, ioff, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pfact/HPL_pdrpanrlT.cpp b/src/pfact/HPL_pdrpanrlT.cpp new file mode 100644 index 0000000..8a93391 --- /dev/null +++ b/src/pfact/HPL_pdrpanrlT.cpp @@ -0,0 +1,198 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdrpanrlT(HPL_T_panel* PANEL, + const int M, + const int N, + const int ICOFF, + double* WORK, + int thread_rank, + int thread_size, + double* max_value, + int* max_index) { + /* + * Purpose + * ======= + * + * HPL_pdrpanrlT recursively factorizes a panel of columns using the + * recursive Right-looking variant of the one-dimensional algorithm. The + * lower triangular N0-by-N0 upper block of the panel is stored in + * transpose form. + * + * Bi-directional exchange is used to perform the swap::broadcast + * operations at once for one column in the panel. This results in a + * lower number of slightly larger messages than usual. On P processes + * and assuming bi-directional links, the running time of this function + * can be approximated by (when N is equal to N0): + * + * N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) + + * N0^2 * ( M - N0/3 ) * gam2-3 + * + * where M is the local number of rows of the panel, lat and bdwth are + * the latency and bandwidth of the network for double precision real + * words, and gam2-3 is an estimate of the Level 2 and Level 3 BLAS + * rate of execution. The recursive algorithm allows indeed to almost + * achieve Level 3 BLAS performance in the panel factorization. On a + * large number of modern machines, this operation is however latency + * bound, meaning that its cost can be estimated by only the latency + * portion N0 * log_2(P) * lat. Mono-directional links will double this + * communication cost. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * M (local input) const int + * On entry, M specifies the local number of rows of sub(A). + * + * N (local input) const int + * On entry, N specifies the local number of columns of sub(A). + * + * ICOFF (global input) const int + * On entry, ICOFF specifies the row and column offset of sub(A) + * in A. + * + * WORK (local workspace) double * + * On entry, WORK is a workarray of size at least 2*(4+2*N0). + * + * --------------------------------------------------------------------- + */ + + double *A, *Aptr, *L1, *L1ptr; + int curr, ii, ioff, jb, jj, lda, m, n, n0, nb, nbdiv, nbmin; + + if(N <= (nbmin = PANEL->algo->nbmin)) { + PANEL->algo->pffun(PANEL, + M, + N, + ICOFF, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + return; + } + /* + * Find new recursive blocking factor. To avoid an infinite loop, one + * must guarantee: 1 <= jb < N, knowing that N is greater than NBMIN. + * First, we compute nblocks: the number of blocks of size NBMIN in N, + * including the last one that may be smaller. nblocks is thus larger + * than or equal to one, since N >= NBMIN. + * The ratio ( nblocks + NDIV - 1 ) / NDIV is thus larger than or equal + * to one as well. For NDIV >= 2, we are guaranteed that the quan- + * tity ( ( nblocks + NDIV - 1 ) / NDIV ) * NBMIN is less than N and + * greater than or equal to NBMIN. 
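+ * + * (Editor's illustration of the resulting recursion, with hypothetical + * choices N0 = 512, NBMIN = 64 and NDIV = 2: the top level uses + * jb = 256, each 256-wide block recurses with jb = 128, then jb = 64, + * at which point N <= NBMIN and the selected base-case factorization + * routine PANEL->algo->pffun is invoked.)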
+ */ + nbdiv = PANEL->algo->nbdiv; + ii = jj = 0; + m = M; + n = N; + nb = jb = ((((N + nbmin - 1) / nbmin) + nbdiv - 1) / nbdiv) * nbmin; + + A = PANEL->A; + lda = PANEL->lda; + L1 = PANEL->L1; + n0 = PANEL->jb; + L1ptr = Mptr(L1, ICOFF, ICOFF, n0); + curr = (int)(PANEL->grid->myrow == PANEL->prow); + + if(curr != 0) + Aptr = Mptr(A, ICOFF, ICOFF, lda); + else + Aptr = Mptr(A, 0, ICOFF, lda); + /* + * The triangular solve is replicated in every process row. The panel + * factorization is such that the first rows of A are accumulated in + * every process row during the (panel) swapping phase. We ensure this + * way a minimum amount of communication during the entire panel facto- + * rization. + */ + do { + n -= jb; + ioff = ICOFF + jj; + /* + * Factor current panel - Replicated solve - Local update + */ + HPL_pdrpanrlT(PANEL, + m, + jb, + ioff, + WORK, + thread_rank, + thread_size, + max_value, + max_index); + + if(thread_rank == 0) { + HPL_dtrsm(HplColumnMajor, + HplRight, + HplUpper, + HplNoTrans, + HplUnit, + n, + jb, + HPL_rone, + Mptr(L1ptr, jj, jj, n0), + n0, + Mptr(L1ptr, jj + jb, jj, n0), + n0); + } + if(curr != 0) { + ii += jb; + m -= jb; + } + +#pragma omp barrier + + HPL_dgemm_omp(HplColumnMajor, + HplNoTrans, + HplTrans, + m, + n, + jb, + -HPL_rone, + Mptr(Aptr, ii, jj, lda), + lda, + Mptr(L1ptr, jj + jb, jj, n0), + n0, + HPL_rone, + Mptr(Aptr, ii, jj + jb, lda), + lda, + PANEL->nb, + (curr != 0) ? ICOFF + ii : 0, + thread_rank, + thread_size); + + /* + * Copy back upper part of A in current process row - Go the next block + */ + if(curr != 0) { + if(thread_rank == 0) { + HPL_dlatcpy( + ioff, jb, Mptr(L1, ioff, 0, n0), n0, Mptr(A, 0, ioff, lda), lda); + } + } + jj += jb; + jb = Mmin(n, nb); + + } while(n > 0); +} diff --git a/src/pgesv/HPL_pdgesv.cpp b/src/pgesv/HPL_pdgesv.cpp new file mode 100644 index 0000000..ef0b0cd --- /dev/null +++ b/src/pgesv/HPL_pdgesv.cpp @@ -0,0 +1,409 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdgesv(HPL_T_grid* GRID, HPL_T_palg* ALGO, HPL_T_pmat* A) { + /* + * Purpose + * ======= + * + * HPL_pdgesv factors a N+1-by-N matrix using LU factorization with row + * partial pivoting. The main algorithm is the "right looking" variant + * with or without look-ahead. The lower triangular factor is left + * unpivoted and the pivots are not returned. The right hand side is the + * N+1 column of the coefficient matrix. + * + * Arguments + * ========= + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * ALGO (global input) HPL_T_palg * + * On entry, ALGO points to the data structure containing the + * algorithmic parameters. + * + * A (local input/output) HPL_T_pmat * + * On entry, A points to the data structure containing the local + * array information. 
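+ * + * (Editor's overview, added for readability: with the lookahead depth + * hardcoded to 1 below, each iteration of the main loop pipelines the + * work roughly as follows - finish the lookahead update of the next + * panel's columns, copy that panel to the host and factor it with + * HPL_pdfact while the GPU continues the HPL_UPD_1 and HPL_UPD_2 + * trailing updates, then broadcast the factored panel and begin its + * row swaps.)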
+ * + * --------------------------------------------------------------------- + */ + + if(A->n <= 0) return; + + A->info = 0; + + HPL_T_panel * p, **panel = NULL; + HPL_T_UPD_FUN HPL_pdupdate; + int N, icurcol = 0, j, jb, jj = 0, jstart, k, mycol, n, nb, nn, npcol, nq, + tag = MSGID_BEGIN_FACT, test; +#ifdef HPL_PROGRESS_REPORT + double start_time, time, step_time, gflops, step_gflops; +#endif + + // depth = ALGO->depth; + const int depth = 1; // NC: Hardcoded now + + mycol = GRID->mycol; + npcol = GRID->npcol; + HPL_pdupdate = ALGO->upfun; + N = A->n; + nb = A->nb; + + if(N <= 0) return; + +#ifdef HPL_PROGRESS_REPORT + start_time = HPL_ptimer_walltime(); +#endif + + /* + * Allocate a panel list of length depth + 1 (depth >= 1) + */ + panel = (HPL_T_panel**)malloc((size_t)(depth + 1) * sizeof(HPL_T_panel*)); + if(panel == NULL) { + HPL_pabort(__LINE__, "HPL_pdgesvK2", "Memory allocation failed"); + } + /* + * Create and initialize the first panel + */ + nq = HPL_numroc(N + 1, nb, nb, mycol, 0, npcol); + nn = N; + jstart = 0; + + jb = Mmin(nn, nb); + HPL_pdpanel_new( + GRID, ALGO, nn, nn + 1, jb, A, jstart, jstart, tag, &panel[0]); + nn -= jb; + jstart += jb; + if(mycol == icurcol) { + jj += jb; + nq -= jb; + } + icurcol = MModAdd1(icurcol, npcol); + tag = MNxtMgid(tag, MSGID_BEGIN_FACT, MSGID_END_FACT); + + /* + * Create second panel + */ + HPL_pdpanel_new( + GRID, ALGO, nn, nn + 1, Mmin(nn, nb), A, jstart, jstart, tag, &panel[1]); + tag = MNxtMgid(tag, MSGID_BEGIN_FACT, MSGID_END_FACT); + + /* + * Initialize the lookahead - Factor jstart columns: panel[0] + */ + jb = jstart; + jb = Mmin(jb, nb); + /* + * Factor and broadcast 0-th panel + */ + HPL_pdpanel_SendToHost(panel[0]); + HPL_pdpanel_Wait(panel[0]); + + HPL_pdfact(panel[0]); + + // send the panel back to device before bcast + HPL_pdpanel_SendToDevice(panel[0]); + HPL_pdpanel_Wait(panel[0]); + + HPL_pdpanel_bcast(panel[0]); + + // start Ubcast+row swapping for second part of A + HPL_pdlaswp_start(panel[0], HPL_UPD_2); + + if(mycol == icurcol) { + // start Ubcast+row swapping for look ahead + HPL_pdlaswp_start(panel[0], HPL_LOOK_AHEAD); + } + + // start Ubcast+row swapping for first part of A + HPL_pdlaswp_start(panel[0], HPL_UPD_1); + + // Ubcast+row swaps for second part of A + HPL_pdlaswp_exchange(panel[0], HPL_UPD_2); + + if(mycol == icurcol) { + // Ubcast+row swaps for look ahead + // nn = HPL_numrocI(jb, j, nb, nb, mycol, 0, npcol); + HPL_pdlaswp_exchange(panel[0], HPL_LOOK_AHEAD); + } + + double stepStart, stepEnd; + +#ifdef HPL_PROGRESS_REPORT +#ifdef HPL_DETAILED_TIMING + float smallDgemmTime, largeDgemm1Time, largeDgemm2Time; + double smallDgemmGflops, largeDgemm1Gflops, largeDgemm2Gflops; + + if(GRID->myrow == 0 && mycol == 0) { + printf("-------------------------------------------------------------------" + "-------------------------------------------------------------------" + "---------------------------------------------\n"); + printf(" %% | Column | Step Time (s) || DGEMM GFLOPS " + " || Panel Copy(s) | pdfact (s) | pmxswp (s) | Lbcast (s) | laswp " + "(s) | GPU Sync (s) | Step GFLOPS | Overall GFLOPS\n"); + printf(" | | | Small | First | Second " + " | | | | | " + " | | | \n"); + printf("-------------------------------------------------------------------" + "-------------------------------------------------------------------" + "---------------------------------------------\n"); + } +#else + if(GRID->myrow == 0 && mycol == 0) { + printf("---------------------------------------------------\n"); + printf(" %% | Column | 
Step Time (s) | Overall GFLOPS\n"); + printf(" | | | \n"); + printf("---------------------------------------------------\n"); + } +#endif +#endif + + /* + * Main loop over the remaining columns of A + */ + for(j = jstart; j < N; j += nb) { + HPL_ptimer_stepReset(HPL_TIMING_N, HPL_TIMING_BEG); + + stepStart = MPI_Wtime(); + n = N - j; + jb = Mmin(n, nb); + /* + * Initialize current panel - Finish latest update, Factor and broadcast + * current panel + */ + (void)HPL_pdpanel_free(panel[1]); + HPL_pdpanel_init(GRID, ALGO, n, n + 1, jb, A, j, j, tag, panel[1]); + + if(mycol == icurcol) { + /* update look ahead */ + HPL_pdlaswp_end(panel[0], HPL_LOOK_AHEAD); + HPL_pdupdate(panel[0], HPL_LOOK_AHEAD); + + // when the look ahead update is finished, copy back the current panel + hipStreamWaitEvent(dataStream, update[HPL_LOOK_AHEAD], 0); + HPL_pdpanel_SendToHost(panel[1]); + + /* Queue up finishing the second section */ + HPL_pdlaswp_end(panel[0], HPL_UPD_2); + HPL_pdupdate(panel[0], HPL_UPD_2); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); + hipEventSynchronize(update[HPL_LOOK_AHEAD]); + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + + // wait for the panel to arrive + HPL_pdpanel_Wait(panel[0]); + +#ifdef HPL_PROGRESS_REPORT +#ifdef HPL_DETAILED_TIMING + const int curr = (panel[0]->grid->myrow == panel[0]->prow ? 1 : 0); + const int mp = panel[0]->mp - (curr != 0 ? jb : 0); + + // compute the GFLOPs of the look ahead update DGEMM + hipEventElapsedTime(&smallDgemmTime, + dgemmStart[HPL_LOOK_AHEAD], + dgemmStop[HPL_LOOK_AHEAD]); + smallDgemmGflops = + (2.0 * mp * jb * jb) / (1000.0 * 1000.0 * smallDgemmTime); +#endif +#endif + + /*Panel factorization FLOP count is (2/3)NB^3 - (1/2)NB^2 - (1/6)NB + + * (N-i*NB)(NB^2-NB)*/ + HPL_pdfact(panel[1]); /* factor current panel */ + + // send the panel back to device before bcast + HPL_pdpanel_SendToDevice(panel[1]); + HPL_pdpanel_Wait(panel[0]); + } else { + /* Queue up finishing the second section */ + HPL_pdlaswp_end(panel[0], HPL_UPD_2); + HPL_pdupdate(panel[0], HPL_UPD_2); + } + + /* broadcast current panel */ + HPL_pdpanel_bcast(panel[1]); + + // start Ubcast+row swapping for second part of A + HPL_pdlaswp_start(panel[1], HPL_UPD_2); + + // while the second section is updating, exchange the rows from the first + // section + HPL_pdlaswp_exchange(panel[0], HPL_UPD_1); + + /* Queue up finishing the first section */ + HPL_pdlaswp_end(panel[0], HPL_UPD_1); + HPL_pdupdate(panel[0], HPL_UPD_1); + + if(mycol == icurcol) { + jj += jb; + nq -= jb; + } + icurcol = MModAdd1(icurcol, npcol); + tag = MNxtMgid(tag, MSGID_BEGIN_FACT, MSGID_END_FACT); + + if(mycol == icurcol) { + // prep the row swaps for the next look ahead + // nn = HPL_numrocI(jb, j+nb, nb, nb, mycol, 0, npcol); + HPL_pdlaswp_start(panel[1], HPL_LOOK_AHEAD); + + // start Ubcast+row swapping for first part of A + HPL_pdlaswp_start(panel[1], HPL_UPD_1); + + HPL_pdlaswp_exchange(panel[1], HPL_UPD_2); + + HPL_pdlaswp_exchange(panel[1], HPL_LOOK_AHEAD); + } else { + // start Ubcast+row swapping for first part of A + HPL_pdlaswp_start(panel[1], HPL_UPD_1); + + HPL_pdlaswp_exchange(panel[1], HPL_UPD_2); + } + + // wait here for the updates to compete +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + hipDeviceSynchronize(); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + + stepEnd = MPI_Wtime(); + +#ifdef HPL_PROGRESS_REPORT +#ifdef HPL_DETAILED_TIMING + const int curr = (panel[0]->grid->myrow == panel[0]->prow ? 
1 : 0); + const int mp = panel[0]->mp - (curr != 0 ? jb : 0); + + largeDgemm1Time = 0.0; + largeDgemm2Time = 0.0; + if(panel[0]->nu1) { + hipEventElapsedTime( + &largeDgemm1Time, dgemmStart[HPL_UPD_1], dgemmStop[HPL_UPD_1]); + largeDgemm1Gflops = (2.0 * mp * jb * (panel[0]->nu1)) / + (1000.0 * 1000.0 * (largeDgemm1Time)); + } + if(panel[0]->nu2) { + hipEventElapsedTime( + &largeDgemm2Time, dgemmStart[HPL_UPD_2], dgemmStop[HPL_UPD_2]); + largeDgemm2Gflops = (2.0 * mp * jb * (panel[0]->nu2)) / + (1000.0 * 1000.0 * (largeDgemm2Time)); + } +#endif + /* if this is process 0,0 and not the first panel */ + if(GRID->myrow == 0 && mycol == 0 && j > 0) { + time = HPL_ptimer_walltime() - start_time; + step_time = stepEnd - stepStart; + /* + Step FLOP count is (2/3)NB^3 - (1/2)NB^2 - (1/6)NB + + 2*n*NB^2 - n*NB + 2*NB*n^2 + + Overall FLOP count is (2/3)(N^3-n^3) - (1/2)(N^2-n^2) - (1/6)(N-n) + */ + step_gflops = + ((2.0 / 3.0) * jb * jb * jb - (1.0 / 2.0) * jb * jb - + (1.0 / 6.0) * jb + 2.0 * n * jb * jb - jb * n + 2.0 * jb * n * n) / + (step_time > 0.0 ? step_time : 1.e-6) / 1.e9; + gflops = ((2.0 / 3.0) * (N * (double)N * N - n * (double)n * n) - + (1.0 / 2.0) * (N * (double)N - n * (double)n) - + (1.0 / 6.0) * ((double)N - (double)n)) / + (time > 0.0 ? time : 1.e-6) / 1.e9; + printf("%5.1f%% | %09d | ", j * 100.0 / N, j); + printf(" %9.7f |", stepEnd - stepStart); + +#ifdef HPL_DETAILED_TIMING + if(panel[0]->nu0) { + printf(" %9.3e|", smallDgemmGflops); + } else { + printf(" |"); + } + if(panel[0]->nu2) { + printf(" %9.3e|", largeDgemm2Gflops); + } else { + printf(" |"); + } + + if(panel[0]->nu1) { + printf(" %9.3e|", largeDgemm1Gflops); + } else { + printf(" |"); + } + + if(panel[0]->nu0) { + printf(" %9.3e | %9.3e | %9.3e |", + HPL_ptimer_getStep(HPL_TIMING_COPY), + HPL_ptimer_getStep(HPL_TIMING_RPFACT), + HPL_ptimer_getStep(HPL_TIMING_MXSWP)); + } else { + printf(" | | |"); + } + + printf(" %9.3e | %9.3e | %9.3e |", + HPL_ptimer_getStep(HPL_TIMING_LBCAST), + HPL_ptimer_getStep(HPL_TIMING_LASWP), + HPL_ptimer_getStep(HPL_TIMING_UPDATE)); + + printf(" %9.3e |", step_gflops); +#endif + + printf(" %9.3e \n", gflops); + } +#endif + + /* + * Circular of the panel pointers: + * xtmp = x[0]; for( k=0; k < 1; k++ ) x[k] = x[k+1]; x[d] = xtmp; + * + * Go to next process row and column - update the message ids for broadcast + */ + p = panel[0]; + panel[0] = panel[1]; + panel[1] = p; + } + /* + * Clean-up: Finish updates - release panels and panel list + */ + // nn = HPL_numrocI(1, N, nb, nb, mycol, 0, npcol); + HPL_pdlaswp_end(panel[0], HPL_LOOK_AHEAD); + HPL_pdupdate(panel[0], HPL_LOOK_AHEAD); + + HPL_pdlaswp_end(panel[0], HPL_UPD_2); + HPL_pdupdate(panel[0], HPL_UPD_2); + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + hipDeviceSynchronize(); +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_UPDATE); +#endif + + HPL_pdpanel_disp(&panel[0]); + HPL_pdpanel_disp(&panel[1]); + if(panel) free(panel); + + /* + * Solve upper triangular system + */ + if(A->info == 0) HPL_pdtrsv(GRID, A); +} diff --git a/src/pgesv/HPL_pdlaswp.cpp b/src/pgesv/HPL_pdlaswp.cpp new file mode 100644 index 0000000..f35113b --- /dev/null +++ b/src/pgesv/HPL_pdlaswp.cpp @@ -0,0 +1,532 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. 
Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdlaswp_start(HPL_T_panel* PANEL, const HPL_T_UPD UPD) { + /* + * Purpose + * ======= + * + * HPL_pdlaswp_start begins the NB row interchanges to NN columns of the + * trailing submatrix and broadcast a column panel. The rows needed for + * the row interchanges are packed into U (in the current row) and W + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + double *U, *W; + double *dA, *dU, *dW; + int * ipID, *iplen, *ipcounts, *ipoffsets, *iwork, + *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU; + int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex; + int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; + + /* .. + * .. Executable Statements .. + */ + n = PANEL->n; + jb = PANEL->jb; + + /* + * Retrieve parameters from the PANEL data structure + */ + nprow = PANEL->grid->nprow; + myrow = PANEL->grid->myrow; + iflag = PANEL->IWORK; + + MPI_Comm comm = PANEL->grid->col_comm; + + // quick return if we're 1xQ + if(nprow == 1) return; + + dA = PANEL->dA; + lda = PANEL->dlda; + icurrow = PANEL->prow; + + if(UPD == HPL_LOOK_AHEAD) { + U = PANEL->U; + W = PANEL->W; + dU = PANEL->dU; + dW = PANEL->dW; + LDU = PANEL->ldu0; + LDW = PANEL->ldu0; + n = PANEL->nu0; + + } else if(UPD == HPL_UPD_1) { + U = PANEL->U1; + W = PANEL->W1; + dU = PANEL->dU1; + dW = PANEL->dW1; + LDU = PANEL->ldu1; + LDW = PANEL->ldu1; + n = PANEL->nu1; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0, lda); + + } else if(UPD == HPL_UPD_2) { + U = PANEL->U2; + W = PANEL->W2; + dU = PANEL->dU2; + dW = PANEL->dW2; + LDU = PANEL->ldu2; + LDW = PANEL->ldu2; + n = PANEL->nu2; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, lda); + } + + /* + * Quick return if there is nothing to do + */ + if((n <= 0) || (jb <= 0)) return; + + /* + * Compute ipID (if not already done for this panel). lindxA and lindxAU + * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 + * are of size nprow, permU is of length jb, and this function needs a + * workspace of size max( 2 * jb (plindx1), nprow+1(equil)): + * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1) + * i.e. 
4 + 9*jb + 3*nprow + max(2*jb, nprow+1); + */ + k = (int)((unsigned int)(jb) << 1); + ipl = iflag + 1; + ipID = ipl + 1; + ipA = ipID + ((unsigned int)(k) << 1); + iplen = ipA + 1; + ipcounts = iplen + nprow + 1; + ipoffsets = ipcounts + nprow; + iwork = ipoffsets + nprow; + + lindxU = PANEL->lindxU; + lindxA = PANEL->lindxA; + lindxAU = PANEL->lindxAU; + permU = PANEL->permU; + + dlindxU = PANEL->dlindxU; + dlindxA = PANEL->dlindxA; + dlindxAU = PANEL->dlindxAU; + dpermU = PANEL->dpermU; + dpermU_ex = dpermU + jb; + + if(*iflag == -1) /* no index arrays have been computed so far */ + { + // get the ipivs on the host after the Bcast + if(PANEL->grid->mycol != PANEL->pcol) { + hipMemcpy2DAsync(PANEL->ipiv, + PANEL->jb * sizeof(int), + PANEL->dipiv, + PANEL->jb * sizeof(int), + PANEL->jb * sizeof(int), + 1, + hipMemcpyDeviceToHost, + dataStream); + } + hipStreamSynchronize(dataStream); + + // compute spreading info + HPL_pipid(PANEL, ipl, ipID); + HPL_plindx( + PANEL, *ipl, ipID, ipA, lindxU, lindxAU, lindxA, iplen, permU, iwork); + *iflag = 1; + } + + /* + * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti- + * mately goes to U( :, lindxAU[i] ). In each rank, we directly pack + * into U, otherwise we pack into workspace. The first + * entry of each column packed in workspace is in fact the row or column + * offset in U where it should go to. + */ + if(myrow == icurrow) { + // copy needed rows of A into U + HPL_dlaswp01T(jb, n, dA, lda, dU, LDU, dlindxU); + } else { + // copy needed rows from A into U(:, iplen[myrow]) + HPL_dlaswp03T(iplen[myrow + 1] - iplen[myrow], + n, + dA, + lda, + Mptr(dU, 0, iplen[myrow], LDU), + LDU, + dlindxU); + } + + // record when packing completes + hipEventRecord(swapStartEvent[UPD], computeStream); + + /* + * End of HPL_pdlaswp_start + */ +} + +void HPL_pdlaswp_exchange(HPL_T_panel* PANEL, const HPL_T_UPD UPD) { + /* + * Purpose + * ======= + * + * HPL_pdlaswp_exchange applies the NB row interchanges to NN columns of + * the trailing submatrix and broadcast a column panel. + * + * A "Spread then roll" algorithm performs the swap :: broadcast of the + * row panel U at once, resulting in a minimal communication volume and + * a "very good" use of the connectivity if available. With P process + * rows and assuming bi-directional links, the running time of this + * function can be approximated by: + * + * (log_2(P)+(P-1)) * lat + K * NB * LocQ(N) / bdwth + * + * where NB is the number of rows of the row panel U, N is the global + * number of columns being updated, lat and bdwth are the latency and + * bandwidth of the network for double precision real words. K is + * a constant in (2,3] that depends on the achieved bandwidth during a + * simultaneous message exchange between two processes. An empirical + * optimistic value of K is typically 2.4. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * --------------------------------------------------------------------- + */ + /* + * .. Local Variables .. + */ + double *U, *W; + double *dA, *dU, *dW; + int * ipID, *iplen, *ipcounts, *ipoffsets, *iwork, + *lindxU = NULL, *lindxA = NULL, *lindxAU, *permU; + int *dlindxU = NULL, *dlindxA = NULL, *dlindxAU, *dpermU, *dpermU_ex; + int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW; + + /* .. + * .. Executable Statements .. 
+ */ + n = PANEL->n; + jb = PANEL->jb; + + /* + * Retrieve parameters from the PANEL data structure + */ + nprow = PANEL->grid->nprow; + myrow = PANEL->grid->myrow; + iflag = PANEL->IWORK; + + MPI_Comm comm = PANEL->grid->col_comm; + + // quick return if we're 1xQ + if(nprow == 1) return; + + dA = PANEL->dA; + lda = PANEL->dlda; + icurrow = PANEL->prow; + + if(UPD == HPL_LOOK_AHEAD) { + U = PANEL->U; + W = PANEL->W; + dU = PANEL->dU; + dW = PANEL->dW; + LDU = PANEL->ldu0; + LDW = PANEL->ldu0; + n = PANEL->nu0; + + } else if(UPD == HPL_UPD_1) { + U = PANEL->U1; + W = PANEL->W1; + dU = PANEL->dU1; + dW = PANEL->dW1; + LDU = PANEL->ldu1; + LDW = PANEL->ldu1; + n = PANEL->nu1; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0, lda); + + } else if(UPD == HPL_UPD_2) { + U = PANEL->U2; + W = PANEL->W2; + dU = PANEL->dU2; + dW = PANEL->dW2; + LDU = PANEL->ldu2; + LDW = PANEL->ldu2; + n = PANEL->nu2; + // we call the row swap start before the first section is updated + // so shift the pointers + dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, lda); + } + + /* + * Quick return if there is nothing to do + */ + if((n <= 0) || (jb <= 0)) return; + + /* + * Compute ipID (if not already done for this panel). lindxA and lindxAU + * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1 + * are of size nprow, permU is of length jb, and this function needs a + * workspace of size max( 2 * jb (plindx1), nprow+1(equil)): + * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1) + * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1); + */ + k = (int)((unsigned int)(jb) << 1); + ipl = iflag + 1; + ipID = ipl + 1; + ipA = ipID + ((unsigned int)(k) << 1); + iplen = ipA + 1; + ipcounts = iplen + nprow + 1; + ipoffsets = ipcounts + nprow; + iwork = ipoffsets + nprow; + + lindxA = PANEL->lindxA; + lindxAU = PANEL->lindxAU; + lindxU = PANEL->lindxU; + permU = PANEL->permU; + + dlindxA = PANEL->dlindxA; + dlindxAU = PANEL->dlindxAU; + dlindxU = PANEL->dlindxU; + dpermU = PANEL->dpermU; + dpermU_ex = dpermU + jb; + + /* Set MPI message counts and offsets */ + ipcounts[0] = (iplen[1] - iplen[0]) * LDU; + ipoffsets[0] = 0; + + for(int i = 1; i < nprow; ++i) { + ipcounts[i] = (iplen[i + 1] - iplen[i]) * LDU; + ipoffsets[i] = ipcounts[i - 1] + ipoffsets[i - 1]; + } + ipoffsets[nprow] = ipcounts[nprow - 1] + ipoffsets[nprow - 1]; + + /* + * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti- + * mately goes to U( :, lindxAU[i] ). In each rank, we directly pack + * into U, otherwise we pack into workspace. The first + * entry of each column packed in workspace is in fact the row or column + * offset in U where it should go to. 
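+ * + * (Editor's sketch of the exchange below, with hypothetical sizes: for + * nprow = 4 and iplen = { 0, 2, 5, 8, 12 }, the root row icurrow + * scatters ( iplen[i+1] - iplen[i] ) * LDU doubles to each process + * row i, and the subsequent all-gather rolls every row's piece back + * so that all four rows end up holding the complete row panel U.)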
+   */
+
+  if(myrow == icurrow) {
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+#endif
+
+    // hipStreamSynchronize(computeStream);
+    hipEventSynchronize(swapStartEvent[UPD]);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+
+    // send rows to other ranks
+    HPL_scatterv(dU, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
+
+    // All gather dU
+    HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+
+  } else {
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+#endif
+
+    // wait for dU to be ready
+    // hipStreamSynchronize(computeStream);
+    hipEventSynchronize(swapStartEvent[UPD]);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_UPDATE);
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+
+    // receive rows from icurrow into dW
+    HPL_scatterv(dW, ipcounts, ipoffsets, ipcounts[myrow], icurrow, comm);
+
+    // All gather dU
+    HPL_allgatherv(dU, ipcounts[myrow], ipcounts, ipoffsets, comm);
+
+#ifdef HPL_DETAILED_TIMING
+    HPL_ptimer(HPL_TIMING_LASWP);
+#endif
+  }
+  /*
+   * End of HPL_pdlaswp_exchange
+   */
+}
+
+void HPL_pdlaswp_end(HPL_T_panel* PANEL, const HPL_T_UPD UPD) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdlaswp_end copies scattered rows of A into an array U. The
+   * row offsets in A of the source rows are specified by LINDXA. The
+   * destinations of those rows are specified by LINDXAU. A positive
+   * value of LINDXAU indicates that the destination array is U, and A
+   * otherwise. Rows of A are stored as columns in U.
+   *
+   * Arguments
+   * =========
+   *
+   * PANEL   (local input/output)          HPL_T_panel *
+   *         On entry, PANEL points to the data structure containing the
+   *         panel information.
+   *
+   * ---------------------------------------------------------------------
+   */
+  /*
+   * .. Local Variables ..
+   */
+  double *U, *W;
+  double *dA, *dU, *dW;
+  int *ipID, *iplen, *ipcounts, *ipoffsets, *iwork, *lindxA = NULL, *lindxAU,
+      *permU;
+  int *dlindxA = NULL, *dlindxAU, *dlindxU, *dpermU, *dpermU_ex;
+  int icurrow, *iflag, *ipA, *ipl, jb, k, lda, myrow, n, nprow, LDU, LDW;
+
+  /* ..
+   * .. Executable Statements ..
+   */
+  n  = PANEL->n;
+  jb = PANEL->jb;
+
+  /*
+   * Retrieve parameters from the PANEL data structure
+   */
+  nprow = PANEL->grid->nprow;
+  myrow = PANEL->grid->myrow;
+  iflag = PANEL->IWORK;
+
+  MPI_Comm comm = PANEL->grid->col_comm;
+
+  dA      = PANEL->dA;
+  lda     = PANEL->dlda;
+  icurrow = PANEL->prow;
+
+  if(UPD == HPL_LOOK_AHEAD) {
+    U   = PANEL->U;
+    W   = PANEL->W;
+    dU  = PANEL->dU;
+    dW  = PANEL->dW;
+    LDU = PANEL->ldu0;
+    LDW = PANEL->ldu0;
+    n   = PANEL->nu0;
+
+  } else if(UPD == HPL_UPD_1) {
+    U   = PANEL->U1;
+    W   = PANEL->W1;
+    dU  = PANEL->dU1;
+    dW  = PANEL->dW1;
+    LDU = PANEL->ldu1;
+    LDW = PANEL->ldu1;
+    n   = PANEL->nu1;
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    dA = Mptr(dA, 0, PANEL->nu0, lda);
+
+  } else if(UPD == HPL_UPD_2) {
+    U   = PANEL->U2;
+    W   = PANEL->W2;
+    dU  = PANEL->dU2;
+    dW  = PANEL->dW2;
+    LDU = PANEL->ldu2;
+    LDW = PANEL->ldu2;
+    n   = PANEL->nu2;
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    dA = Mptr(dA, 0, PANEL->nu0 + PANEL->nu1, lda);
+  }
+
+  /*
+   * Quick return if there is nothing to do
+   */
+  if((n <= 0) || (jb <= 0)) return;
+
+  // just local swaps if we're 1xQ
+  if(nprow == 1) {
+    HPL_dlaswp00N(jb, n, dA, lda, PANEL->dipiv);
+    return;
+  }
+
+  /*
+   * Compute ipID (if not already done for this panel). lindxA and lindxAU
+   * are of length at most 2*jb - iplen is of size nprow+1, ipmap, ipmapm1
+   * are of size nprow, permU is of length jb, and this function needs a
+   * workspace of size max( 2 * jb (plindx1), nprow+1(equil)):
+   * 1(iflag) + 1(ipl) + 1(ipA) + 9*jb + 3*nprow + 1 + MAX(2*jb,nprow+1)
+   * i.e. 4 + 9*jb + 3*nprow + max(2*jb, nprow+1);
+   */
+  k    = (int)((unsigned int)(jb) << 1);
+  ipl  = iflag + 1;
+  ipID = ipl + 1;
+  ipA  = ipID + ((unsigned int)(k) << 1);
+  iplen = ipA + 1;
+
+  lindxA  = PANEL->lindxA;
+  lindxAU = PANEL->lindxAU;
+  permU   = PANEL->permU;
+
+  dlindxA   = PANEL->dlindxA;
+  dlindxAU  = PANEL->dlindxAU;
+  dlindxU   = PANEL->dlindxU;
+  dpermU    = PANEL->dpermU;
+  dpermU_ex = dpermU + jb;
+
+  /*
+   * For i in [0..2*jb), lindxA[i] is the offset in A of a row that
+   * ultimately goes to U( :, lindxAU[i] ). On the process row owning the
+   * panel (icurrow) we pack directly into U, otherwise we pack into
+   * workspace. The first entry of each column packed in workspace is in
+   * fact the row or column offset in U where it should go.
+   */
+
+  if(myrow == icurrow) {
+    // swap rows local to A on device
+    HPL_dlaswp02T(*ipA, n, dA, lda, dlindxAU, dlindxA);
+  } else {
+    // Queue inserting received rows in W into A on device
+    HPL_dlaswp04T(
+        iplen[myrow + 1] - iplen[myrow], n, dA, lda, dW, LDW, dlindxU);
+  }
+
+  /*
+   * Permute U in every process row
+   */
+  HPL_dlaswp10N(n, jb, dU, LDU, dpermU);
+  /*
+   * End of HPL_pdlaswp_end
+   */
+}
diff --git a/src/pgesv/HPL_pdtrsv_device.cpp b/src/pgesv/HPL_pdtrsv_device.cpp
new file mode 100644
index 0000000..9a01362
--- /dev/null
+++ b/src/pgesv/HPL_pdtrsv_device.cpp
@@ -0,0 +1,352 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+#include <hip/hip_runtime.h>
+
+#define BLOCK_SIZE 512
+__global__ void setZero(const int N, double* __restrict__ X) {
+  const int    t  = threadIdx.x;
+  const int    b  = blockIdx.x;
+  const size_t id = b * BLOCK_SIZE + t; // row id
+
+  if(id < N) { X[id] = 0.0; }
+}
+
+void HPL_pdtrsv(HPL_T_grid* GRID, HPL_T_pmat* AMAT) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdtrsv solves an upper triangular system of linear equations.
+   *
+   * The rhs is the last column of the N by N+1 matrix A. The solve starts
+   * in the process column owning the Nth column of A, so the rhs b may
+   * need to be moved one process column to the left at the beginning. The
+   * routine therefore needs a column vector in every process column but
+   * the one owning b. The result is replicated in all process rows, and
+   * returned in XR, i.e. XR is of size nq = LOCq( N ) in all processes.
+   *
+   * The algorithm uses decreasing one-ring broadcast in process rows and
+   * columns implemented in terms of synchronous communication point to
+   * point primitives. The lookahead of depth 1 is used to minimize the
+   * critical path.
This entire operation is essentially ``latency'' bound + * and an estimate of its running time is given by: + * + * (move rhs) lat + N / ( P bdwth ) + + * (solve) ((N / NB)-1) 2 (lat + NB / bdwth) + + * gam2 N^2 / ( P Q ), + * + * where gam2 is an estimate of the Level 2 BLAS rate of execution. + * There are N / NB diagonal blocks. One must exchange 2 messages of + * length NB to compute the next NB entries of the vector solution, as + * well as performing a total of N^2 floating point operations. + * + * Arguments + * ========= + * + * GRID (local input) HPL_T_grid * + * On entry, GRID points to the data structure containing the + * process grid information. + * + * AMAT (local input/output) HPL_T_pmat * + * On entry, AMAT points to the data structure containing the + * local array information. + * + * --------------------------------------------------------------------- + */ + + MPI_Comm Ccomm, Rcomm; + double * Aprev = NULL, *XC = NULL, *XR = NULL, *Xd = NULL, *Xdprev = NULL, + *W = NULL; + double *dA = NULL, *dAprev = NULL, *dAptr, *dXC = NULL, *dXR = NULL, + *dXd = NULL, *dXdprev = NULL, *dW = NULL; + int Alcol, Alrow, Anpprev, Anp, Anq, Bcol, Cmsgid, GridIsNotPx1, GridIsNot1xQ, + Rmsgid, colprev, kb, kbprev, lda, mycol, myrow, n, n1, n1p, + n1pprev = 0, nb, npcol, nprow, rowprev, tmp1, tmp2; +/* .. + * .. Executable Statements .. + */ +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_PTRSV); +#endif + if((n = AMAT->n) <= 0) return; + + (void)HPL_grid_info(GRID, &nprow, &npcol, &myrow, &mycol); + Rcomm = GRID->row_comm; + Rmsgid = MSGID_BEGIN_PTRSV; + Ccomm = GRID->col_comm; + Cmsgid = MSGID_BEGIN_PTRSV + 1; + GridIsNot1xQ = (nprow > 1); + GridIsNotPx1 = (npcol > 1); + + nb = AMAT->nb; + lda = AMAT->ld; + + Mnumroc(Anp, n, nb, nb, myrow, 0, nprow); + Mnumroc(Anq, n, nb, nb, mycol, 0, npcol); + + dA = AMAT->dA; + dXR = AMAT->dX; + XR = AMAT->W + 2 * Anp; + + XC = AMAT->W; + dXC = AMAT->dW; + + W = AMAT->W + Anp; + dW = AMAT->dW + Anp; + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + /* + * Move the rhs in the process column owning the last column of A. + */ + + tmp1 = (n - 1) / nb; + Alrow = tmp1 - (tmp1 / nprow) * nprow; + Alcol = tmp1 - (tmp1 / npcol) * npcol; + kb = n - tmp1 * nb; + + dAptr = (double*)(dA); + double* dB = Mptr(dAptr, 0, Anq, lda); + + Mindxg2p(n, nb, nb, Bcol, 0, npcol); + + if(Anp > 0) { + if(Alcol != Bcol) { + if(mycol == Bcol) { + hipMemcpy(dXC, dB, Anp * sizeof(double), hipMemcpyDeviceToDevice); + (void)HPL_send(dXC, Anp, Alcol, Rmsgid, Rcomm); + } else if(mycol == Alcol) { + (void)HPL_recv(dXC, Anp, Bcol, Rmsgid, Rcomm); + } + } else { + if(mycol == Bcol) { + hipMemcpy(dXC, dB, Anp * sizeof(double), hipMemcpyDeviceToDevice); + } + } + } + + Rmsgid = (Rmsgid + 2 > MSGID_END_PTRSV ? 
MSGID_BEGIN_PTRSV : Rmsgid + 2);
+  if(mycol != Alcol) {
+    if(Anp) {
+      size_t grid_size = (Anp + BLOCK_SIZE - 1) / BLOCK_SIZE;
+      setZero<<<grid_size, BLOCK_SIZE, 0, stream>>>(Anp, dXC);
+    }
+  }
+  /*
+   * Set up lookahead
+   */
+  n1 = (npcol - 1) * nb;
+  n1 = Mmax(n1, nb);
+
+  Anpprev = Anp;
+  dAprev = dAptr = Mptr(dAptr, 0, Anq, lda);
+  Xdprev  = XR;
+  dXdprev = dXR;
+  tmp1    = n - kb;
+  tmp1 -= (tmp2 = Mmin(tmp1, n1));
+  MnumrocI(n1pprev, tmp2, Mmax(0, tmp1), nb, nb, myrow, 0, nprow);
+
+  if(myrow == Alrow) { Anpprev = (Anp -= kb); }
+  if(mycol == Alcol) {
+    dAprev = (dAptr -= lda * kb);
+    Anq -= kb;
+    Xdprev  = (Xd = XR + Anq);
+    dXdprev = (dXd = dXR + Anq);
+    if(myrow == Alrow) {
+      rocblas_dtrsv(handle,
+                    rocblas_fill_upper,
+                    rocblas_operation_none,
+                    rocblas_diagonal_non_unit,
+                    kb,
+                    dAptr + Anp,
+                    lda,
+                    dXC + Anp,
+                    1);
+      rocblas_dcopy(handle, kb, dXC + Anp, 1, dXd, 1);
+    }
+  }
+
+  rowprev = Alrow;
+  Alrow   = MModSub1(Alrow, nprow);
+  colprev = Alcol;
+  Alcol   = MModSub1(Alcol, npcol);
+  kbprev  = kb;
+  n -= kb;
+  tmp1 = n - (kb = nb);
+  tmp1 -= (tmp2 = Mmin(tmp1, n1));
+  MnumrocI(n1p, tmp2, Mmax(0, tmp1), nb, nb, myrow, 0, nprow);
+  /*
+   * Start the operations
+   */
+  while(n > 0) {
+    if(mycol == Alcol) {
+      dAptr -= lda * kb;
+      Anq -= kb;
+      Xd  = XR + Anq;
+      dXd = dXR + Anq;
+    }
+    if(myrow == Alrow) { Anp -= kb; }
+    /*
+     * Broadcast (decreasing-ring) of previous solution block in previous
+     * process column, compute partial update of current block and send it
+     * to current process column.
+     */
+    if(mycol == colprev) {
+      /*
+       * Send previous solution block in process row above
+       */
+      if(myrow == rowprev) {
+        if(GridIsNot1xQ) {
+          if(kbprev) {
+            hipDeviceSynchronize();
+            (void)HPL_send(
+                dXdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm);
+          }
+        }
+      } else {
+        if(kbprev) {
+          (void)HPL_recv(
+              dXdprev, kbprev, MModAdd1(myrow, nprow), Cmsgid, Ccomm);
+        }
+      }
+      /*
+       * Compute partial update of previous solution block and send it to
+       * current column
+       */
+      if(n1pprev > 0) {
+        tmp1 = Anpprev - n1pprev;
+        const double one  = 1.0;
+        const double mone = -1.0;
+        rocblas_dgemv(handle,
+                      rocblas_operation_none,
+                      n1pprev,
+                      kbprev,
+                      &mone,
+                      dAprev + tmp1,
+                      lda,
+                      dXdprev,
+                      1,
+                      &one,
+                      dXC + tmp1,
+                      1);
+        if(GridIsNotPx1) {
+          if(n1pprev) {
+            hipDeviceSynchronize();
+            (void)HPL_send(dXC + tmp1, n1pprev, Alcol, Rmsgid, Rcomm);
+          }
+        }
+      }
+      /*
+       * Finish the (decreasing-ring) broadcast of the solution block in
+       * previous process column
+       */
+      if((myrow != rowprev) && (myrow != MModAdd1(rowprev, nprow))) {
+        if(kbprev) {
+          hipDeviceSynchronize();
+          (void)HPL_send(
+              dXdprev, kbprev, MModSub1(myrow, nprow), Cmsgid, Ccomm);
+        }
+      }
+    } else if(mycol == Alcol) {
+      /*
+       * Current column receives and accumulates partial update of previous
+       * solution block
+       */
+      if(n1pprev > 0) {
+        if(n1pprev) {
+          (void)HPL_recv(dW, n1pprev, colprev, Rmsgid, Rcomm);
+          const double one = 1.0;
+          rocblas_daxpy(
+              handle, n1pprev, &one, dW, 1, dXC + Anpprev - n1pprev, 1);
+        }
+      }
+    }
+    /*
+     * Solve current diagonal block
+     */
+    if((mycol == Alcol) && (myrow == Alrow)) {
+      rocblas_dtrsv(handle,
+                    rocblas_fill_upper,
+                    rocblas_operation_none,
+                    rocblas_diagonal_non_unit,
+                    kb,
+                    dAptr + Anp,
+                    lda,
+                    dXC + Anp,
+                    1);
+      rocblas_dcopy(handle, kb, dXC + Anp, 1, dXR + Anq, 1);
+    }
+    /*
+     * Finish previous update
+     */
+    if((mycol == colprev) && ((tmp1 = Anpprev - n1pprev) > 0)) {
+      const double one  = 1.0;
+      const double mone = -1.0;
+      rocblas_dgemv(handle,
+                    rocblas_operation_none,
+                    tmp1,
+                    kbprev,
+                    &mone,
+                    dAprev,
+                    lda,
+                    dXdprev,
+                    1,
+                    &one,
+                    dXC,
+ 1); + } + /* + * Save info of current step and update info for the next step + */ + if(mycol == Alcol) { + dAprev = dAptr; + Xdprev = Xd; + dXdprev = dXd; + } + if(myrow == Alrow) { Anpprev -= kb; } + + rowprev = Alrow; + colprev = Alcol; + n1pprev = n1p; + kbprev = kb; + n -= kb; + Alrow = MModSub1(Alrow, nprow); + Alcol = MModSub1(Alcol, npcol); + tmp1 = n - (kb = nb); + tmp1 -= (tmp2 = Mmin(tmp1, n1)); + MnumrocI(n1p, tmp2, Mmax(0, tmp1), nb, nb, myrow, 0, nprow); + + Rmsgid = (Rmsgid + 2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid + 2); + Cmsgid = + (Cmsgid + 2 > MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV + 1 : Cmsgid + 2); + } + /* + * Replicate last solution block + */ + if(mycol == colprev) { + if(kbprev) { + hipDeviceSynchronize(); + (void)HPL_broadcast((void*)(dXR), kbprev, HPL_DOUBLE, rowprev, Ccomm); + } + } + +#ifdef HPL_DETAILED_TIMING + HPL_ptimer(HPL_TIMING_PTRSV); +#endif +} diff --git a/src/pgesv/HPL_pdupdateNT.cpp b/src/pgesv/HPL_pdupdateNT.cpp new file mode 100644 index 0000000..be2e98b --- /dev/null +++ b/src/pgesv/HPL_pdupdateNT.cpp @@ -0,0 +1,169 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_pdupdateNT(HPL_T_panel* PANEL, const HPL_T_UPD UPD) { + /* + * Purpose + * ======= + * + * HPL_pdupdateNT applies the row interchanges and updates part of the + * trailing (using the panel PANEL) submatrix. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel (to be updated) information. + * + * --------------------------------------------------------------------- + */ + + double *Aptr, *L1ptr, *L2ptr, *Uptr, *dpiv; + int* dipiv; + + int curr, i, jb, lda, ldl2, LDU, mp, n, nb; + + /* .. + * .. Executable Statements .. + */ + nb = PANEL->nb; + jb = PANEL->jb; + n = PANEL->nq; + lda = PANEL->dlda; + Aptr = PANEL->dA; + + if(UPD == HPL_LOOK_AHEAD) { + Uptr = PANEL->dU; + LDU = PANEL->ldu0; + n = Mmin(PANEL->nu0, n); + } else if(UPD == HPL_UPD_1) { + Uptr = PANEL->dU1; + LDU = PANEL->ldu1; + n = Mmin(PANEL->nu1, n); + // we call the row swap start before the first section is updated + // so shift the pointers + Aptr = Mptr(Aptr, 0, PANEL->nu0, lda); + } else if(UPD == HPL_UPD_2) { + Uptr = PANEL->dU2; + LDU = PANEL->ldu2; + n = Mmin(PANEL->nu2, n); + // we call the row swap start before the first section is updated + // so shift the pointers + Aptr = Mptr(Aptr, 0, PANEL->nu0 + PANEL->nu1, lda); + } + + /* + * There is nothing to update, enforce the panel broadcast. + */ + if((n <= 0) || (jb <= 0)) { return; } + + hipStream_t stream; + rocblas_get_stream(handle, &stream); + + curr = (PANEL->grid->myrow == PANEL->prow ? 1 : 0); + L2ptr = PANEL->dL2; + L1ptr = PANEL->dL1; + ldl2 = PANEL->dldl2; + mp = PANEL->mp - (curr != 0 ? 
jb : 0);
+
+  const double one  = 1.0;
+  const double mone = -1.0;
+
+  /*
+   * Update
+   */
+  if(PANEL->grid->nprow == 1) {
+    /*
+     * 1 x Q case
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_left,
+                  rocblas_fill_lower,
+                  rocblas_operation_none,
+                  rocblas_diagonal_unit,
+                  jb,
+                  n,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Aptr,
+                  lda);
+
+    HPL_dlatcpy_gpu(n, jb, Aptr, lda, Uptr, LDU);
+  } else {
+    /*
+     * Compute redundantly row block of U and update trailing submatrix
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_right,
+                  rocblas_fill_lower,
+                  rocblas_operation_transpose,
+                  rocblas_diagonal_unit,
+                  n,
+                  jb,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Uptr,
+                  LDU);
+  }
+
+  /*
+   * Queue finishing the update
+   */
+  if(curr != 0) {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Mptr(Aptr, jb, 0, lda),
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+
+    if(PANEL->grid->nprow > 1) HPL_dlatcpy_gpu(jb, n, Uptr, LDU, Aptr, lda);
+  } else {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Aptr,
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+  }
+
+  hipEventRecord(update[UPD], stream);
+}
diff --git a/src/pgesv/HPL_pdupdateTT.cpp b/src/pgesv/HPL_pdupdateTT.cpp
new file mode 100644
index 0000000..bac3101
--- /dev/null
+++ b/src/pgesv/HPL_pdupdateTT.cpp
@@ -0,0 +1,168 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_pdupdateTT(HPL_T_panel* PANEL, const HPL_T_UPD UPD) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pdupdateTT applies the row interchanges and updates part of the
+   * trailing (using the panel PANEL) submatrix.
+   *
+   * Arguments
+   * =========
+   *
+   * PANEL   (local input/output)          HPL_T_panel *
+   *         On entry, PANEL points to the data structure containing the
+   *         panel (to be updated) information.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  double *Aptr, *L1ptr, *L2ptr, *Uptr, *dpiv;
+  int*    dipiv;
+
+  int curr, i, iroff, jb, lda, ldl2, LDU, mp, n, nb;
+
+  /* ..
+   * .. Executable Statements ..
+   */
+  nb   = PANEL->nb;
+  jb   = PANEL->jb;
+  n    = PANEL->nq;
+  lda  = PANEL->dlda;
+  Aptr = PANEL->dA;
+
+  if(UPD == HPL_LOOK_AHEAD) {
+    Uptr = PANEL->dU;
+    LDU  = PANEL->ldu0;
+    n    = Mmin(PANEL->nu0, n);
+  } else if(UPD == HPL_UPD_1) {
+    Uptr = PANEL->dU1;
+    LDU  = PANEL->ldu1;
+    n    = Mmin(PANEL->nu1, n);
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    Aptr = Mptr(Aptr, 0, PANEL->nu0, lda);
+  } else if(UPD == HPL_UPD_2) {
+    Uptr = PANEL->dU2;
+    LDU  = PANEL->ldu2;
+    n    = Mmin(PANEL->nu2, n);
+    // we call the row swap start before the first section is updated
+    // so shift the pointers
+    Aptr = Mptr(Aptr, 0, PANEL->nu0 + PANEL->nu1, lda);
+  }
+
+  /*
+   * There is nothing to update, enforce the panel broadcast.
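+   *
+   * (For orientation: the local columns are processed in up to three
+   * sections of widths nu0 (look-ahead), nu1, and nu2, so a hypothetical
+   * local panel of nq = 512 columns could be split as nu0 = 128,
+   * nu1 = 256, nu2 = 128; the Aptr shifts above skip the sections covered
+   * by the other calls. The numbers are illustrative only.)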
+   */
+  if((n <= 0) || (jb <= 0)) { return; }
+
+  hipStream_t stream;
+  rocblas_get_stream(handle, &stream);
+
+  curr  = (PANEL->grid->myrow == PANEL->prow ? 1 : 0);
+  L2ptr = PANEL->dL2;
+  L1ptr = PANEL->dL1;
+  ldl2  = PANEL->dldl2;
+  mp    = PANEL->mp - (curr != 0 ? jb : 0);
+
+  const double one  = 1.0;
+  const double mone = -1.0;
+
+  /*
+   * Update
+   */
+  if(PANEL->grid->nprow == 1) {
+    /*
+     * 1 x Q case
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_left,
+                  rocblas_fill_upper,
+                  rocblas_operation_transpose,
+                  rocblas_diagonal_unit,
+                  jb,
+                  n,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Aptr,
+                  lda);
+    HPL_dlatcpy_gpu(n, jb, Aptr, lda, Uptr, LDU);
+  } else {
+    /*
+     * Compute redundantly row block of U and update trailing submatrix
+     */
+    rocblas_dtrsm(handle,
+                  rocblas_side_right,
+                  rocblas_fill_upper,
+                  rocblas_operation_none,
+                  rocblas_diagonal_unit,
+                  n,
+                  jb,
+                  &one,
+                  L1ptr,
+                  jb,
+                  Uptr,
+                  LDU);
+  }
+
+  /*
+   * Queue finishing the update
+   */
+  if(curr != 0) {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Mptr(Aptr, jb, 0, lda),
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+
+    if(PANEL->grid->nprow > 1) HPL_dlatcpy_gpu(jb, n, Uptr, LDU, Aptr, lda);
+  } else {
+    hipEventRecord(dgemmStart[UPD], stream);
+    rocblas_dgemm(handle,
+                  rocblas_operation_none,
+                  rocblas_operation_transpose,
+                  mp,
+                  n,
+                  jb,
+                  &mone,
+                  L2ptr,
+                  ldl2,
+                  Uptr,
+                  LDU,
+                  &one,
+                  Aptr,
+                  lda);
+    hipEventRecord(dgemmStop[UPD], stream);
+  }
+
+  hipEventRecord(update[UPD], stream);
+}
diff --git a/src/pgesv/HPL_perm.cpp b/src/pgesv/HPL_perm.cpp
new file mode 100644
index 0000000..3d2fab2
--- /dev/null
+++ b/src/pgesv/HPL_perm.cpp
@@ -0,0 +1,89 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_perm(const int N, int* LINDXA, int* LINDXAU, int* IWORK) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_perm combines two index arrays and generates the corresponding
+   * permutation. First, this function computes the inverse of LINDXA, and
+   * then combines it with LINDXAU. Second, in order to be able to perform
+   * the permutation in place, LINDXAU is overwritten by the sequence of
+   * permutations producing the same result. What we ultimately want to
+   * achieve is: U[LINDXAU[i]] := U[LINDXA[i]] for i in [0..N). After the
+   * call to this function, this in-place permutation can be performed by
+   * swapping U[i] with U[LINDXAU[i]] for i in [0..N).
+   *
+   * Arguments
+   * =========
+   *
+   * N       (global input)                const int
+   *         On entry, N specifies the length of the arrays LINDXA and
+   *         LINDXAU. N should be at least zero.
+   *
+   * LINDXA  (global input/output)         int *
+   *         On entry, LINDXA is an array of dimension N containing the
+   *         source indexes. On exit, LINDXA contains the combined index
+   *         array.
+   *
+   * LINDXAU (global input/output)         int *
+   *         On entry, LINDXAU is an array of dimension N containing the
+   *         target indexes. On exit, LINDXAU contains the sequence of
+   *         permutations that should be applied in increasing order to
+   *         permute the underlying array U in place.
+   *
+   * IWORK   (workspace)                   int *
+   *         On entry, IWORK is a workarray of dimension N.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  int i, j, k, fndd;
+
+  /*
+   * Inverse LINDXA - combine LINDXA and LINDXAU - Initialize IWORK
+   */
+  for(i = 0; i < N; i++) { IWORK[LINDXA[i]] = i; }
+  for(i = 0; i < N; i++) {
+    LINDXA[i] = LINDXAU[IWORK[i]];
+    IWORK[i]  = i;
+  }
+
+  for(i = 0; i < N; i++) {
+    /* search LINDXA such that LINDXA[j] == i */
+    j = 0;
+    do {
+      fndd = (LINDXA[j] == i);
+      j++;
+    } while(!fndd);
+    j--;
+    /* search IWORK such that IWORK[k] == j */
+    k = 0;
+    do {
+      fndd = (IWORK[k] == j);
+      k++;
+    } while(!fndd);
+    k--;
+    /* swap IWORK[i] and IWORK[k]; LINDXAU[i] = k */
+    j          = IWORK[i];
+    IWORK[i]   = IWORK[k];
+    IWORK[k]   = j;
+    LINDXAU[i] = k;
+  }
}
diff --git a/src/pgesv/HPL_pipid.cpp b/src/pgesv/HPL_pipid.cpp
new file mode 100644
index 0000000..1d1cfff
--- /dev/null
+++ b/src/pgesv/HPL_pipid.cpp
@@ -0,0 +1,164 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+
+void HPL_pipid(HPL_T_panel* PANEL, int* K, int* IPID) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_pipid computes an array IPID that contains the source and final
+   * destination of matrix rows resulting from the application of N
+   * interchanges as computed by the LU factorization with row partial
+   * pivoting. The array IPID is such that the row of global index IPID(i)
+   * should be mapped onto the row of global index IPID(i+1). Note that we
+   * cannot really know the length of IPID a priori. However, we know that
+   * this array is at least 2*N long, since there are N rows to swap and
+   * broadcast. The length of this array must be smaller than or equal to
+   * 4*N, since every row is swapped with at most a single distinct remote
+   * row. The algorithm constructing IPID goes as follows: Let IA be the
+   * global index of the first row to be swapped.
+   *
+   * For every row src IA + i with i in [0..N) to be swapped with row dst
+   * such that dst is given by DPIV[i]:
+   *
+   * Is row src the destination of a previous row of the current block,
+   * that is, is there k odd such that IPID(k) is equal to src ?
+   *    Yes: update this destination with dst. For example, if the
+   * pivot array is (0,2)(1,1)(2,5) ... , then when we swap rows 2 and 5,
+   * we swap in fact row 0 and 5, i.e., row 0 goes to 5 and not 2 as it
+   * was thought so far ...
+   *    No : add the pair (src,dst) at the end of IPID; row src has not
+   * been moved yet.
+   *
+   * Is row dst different from src the destination of a previous row of
+   * the current block, i.e., is there k odd such that IPID(k) is equal to
+   * dst ?
+   *    Yes: update IPID(k) with src. For example, if the pivot array
+   * is (0,5)(1,1)(2,5) ... , then when we swap rows 2 and 5, we swap in
+   * fact row 2 and 0, i.e., row 0 goes to 2 and not 5 as it was thought
+   * so far ...
+ * No : add the pair (dst,src) at the end of IPID; row dst has not + * been moved yet. + * + * Note that when src is equal to dst, the pair (dst,src) should not be + * added to IPID in order to avoid duplicated entries in this array. + * During the construction of the array IPID, we make sure that the + * first N entries are such that IPID(k) with k odd is equal to IA+k/2. + * For k in [0..K/2), the row of global index IPID(2*k) should be + * mapped onto the row of global index IPID(2*k+1). + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * K (global output) int * + * On exit, K specifies the number of entries in IPID. K is at + * least 2*N, and at most 4*N. + * + * IPID (global output) int * + * On entry, IPID is an array of length 4*N. On exit, the first + * K entries of that array contain the src and final destination + * resulting from the application of the N interchanges as + * specified by DPIV. The pairs (src,dst) are contiguously + * stored and sorted so that IPID(2*i+1) is equal to IA+i with i + * in [0..N) + * + * --------------------------------------------------------------------- + */ + + int dst, fndd, fnds, ia, i, j, jb, lst, off, src; + int* ipiv; + + ipiv = PANEL->ipiv; + jb = PANEL->jb; + src = ia = PANEL->ia; + dst = (int)(ipiv[0]); + IPID[0] = dst; + IPID[1] = src; + *K = 2; + if(src != dst) { + IPID[2] = src; + IPID[3] = dst; + *K += 2; + } + + for(i = 1; i < jb; i++) { + fnds = 0; + j = 1; + + if((src = ia + i) == (dst = (int)(ipiv[i]))) { + do { + if(src == IPID[j]) { + fnds = j; + } else { + j += 2; + } + } while(!(fnds) && (j < *K)); + if(!fnds) { + lst = *K; + off = 2; + IPID[lst] = src; + } else { + lst = fnds - 1; + off = 0; + } + IPID[lst + 1] = dst; + } else { + fndd = 0; + do { + if(src == IPID[j]) { + fnds = j; + } else if(dst == IPID[j]) { + fndd = j; + } + j += 2; + } while((!(fnds) || !(fndd)) && (j < *K)); + if(!fnds) { + IPID[*K] = src; + IPID[*K + 1] = dst; + off = 2; + } else { + IPID[fnds] = dst; + off = 0; + } + if(!fndd) { + lst = *K + off; + IPID[lst] = dst; + off += 2; + } else { + lst = fndd - 1; + } + IPID[lst + 1] = src; + } + /* + * Enforce IPID(1,i) equal to src = ia + i + */ + if(lst != (j = (i << 1))) { + src = IPID[j]; + IPID[j] = IPID[lst]; + IPID[lst] = src; + dst = IPID[j + 1]; + IPID[j + 1] = IPID[lst + 1]; + IPID[lst + 1] = dst; + } + *K += off; + } +} diff --git a/src/pgesv/HPL_piplen.cpp b/src/pgesv/HPL_piplen.cpp new file mode 100644 index 0000000..d11ea9e --- /dev/null +++ b/src/pgesv/HPL_piplen.cpp @@ -0,0 +1,58 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. 
+ * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_piplen(HPL_T_panel* PANEL, + const int K, + const int* IPID, + int* IPLEN, + int* IWORK) { + + const int nprow = PANEL->grid->nprow; + const int jb = PANEL->jb; + const int nb = PANEL->nb; + const int ia = PANEL->ia; + const int icurrow = PANEL->prow; + + int* iwork = IWORK + jb; + + /* + * Compute IPLEN + */ + for(int i = 0; i <= nprow; i++) IPLEN[i] = 0; + + /* + * IPLEN[i] is the number of rows of A in the processes before + * process i, with the convention that IPLEN[nprow] is the total + * number of rows. + * In other words, IPLEN[i+1] - IPLEN[i] is the local number of + * rows of A that should be moved for each process. + */ + for(int i = 0; i < K; i += 2) { + const int src = IPID[i]; + int srcrow; + Mindxg2p(src, nb, nb, srcrow, 0, nprow); + if(srcrow == icurrow) { + const int dst = IPID[i + 1]; + int dstrow; + Mindxg2p(dst, nb, nb, dstrow, 0, nprow); + if((dstrow != srcrow) || (dst - ia < jb)) IPLEN[dstrow + 1]++; + } + } + + for(int i = 1; i <= nprow; i++) { IPLEN[i] += IPLEN[i - 1]; } +} diff --git a/src/pgesv/HPL_plindx.cpp b/src/pgesv/HPL_plindx.cpp new file mode 100644 index 0000000..8344008 --- /dev/null +++ b/src/pgesv/HPL_plindx.cpp @@ -0,0 +1,238 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +void HPL_plindx(HPL_T_panel* PANEL, + const int K, + const int* IPID, + int* IPA, + int* LINDXU, + int* LINDXAU, + int* LINDXA, + int* IPLEN, + int* PERMU, + int* IWORK) { + /* + * Purpose + * ======= + * + * HPL_plindx computes three local arrays LINDXU, LINDXA, and LINDXAU + * containing the local source and final destination position resulting + * from the application of row interchanges. In addition, this function + * computes the array IPLEN that contains the mapping information for the + * spreading phase. + * + * Arguments + * ========= + * + * PANEL (local input/output) HPL_T_panel * + * On entry, PANEL points to the data structure containing the + * panel information. + * + * K (global input) const int + * On entry, K specifies the number of entries in IPID. K is at + * least 2*N, and at most 4*N. + * + * IPID (global input) const int * + * On entry, IPID is an array of length K. The first K entries + * of that array contain the src and final destination resulting + * from the application of the interchanges. + * + * IPA (global output) int * + * On exit, IPA specifies the number of rows that the current + * process row has that should be swapped with local rows of A. + * + * LINDXU (global output) int * + * On entry, LINDXU is an array of dimension N. On exit, this + * array contains the local indexes of the rows of A I have that + * should be copied into U. + * + * LINDXAU (global output) int * + * On entry, LINDXAU is an array of dimension N. On exit, this + * array contains the local source indexes of the rows of A I + * have that should be swapped locally. 
+ *
+ * LINDXA  (global output)               int *
+ *         On entry, LINDXA is an array of dimension N. On exit, this
+ *         array contains the local destination indexes of the rows
+ *         of A I have that should be swapped locally.
+ *
+ * IPLEN   (global output)               int *
+ *         On entry, IPLEN is an array of dimension NPROW + 1. On exit,
+ *         this array is such that IPLEN[i] is the number of rows of A
+ *         in the processes before process i, with the convention that
+ *         IPLEN[nprow] is the total number of rows of the panel. In
+ *         other words, IPLEN[i+1] - IPLEN[i] is the local number of
+ *         rows of A that should be moved to process i. In particular,
+ *         the number of rows of the source process row is given by
+ *         IPLEN[1] - IPLEN[0].
+ *
+ * PERMU   (global output)               int *
+ *         On entry, PERMU is an array of dimension JB. On exit, PERMU
+ *         contains a sequence of permutations that should be applied
+ *         in increasing order to permute in place the row panel U.
+ *
+ * IWORK   (workspace)                   int *
+ *         On entry, IWORK is a workarray of dimension 2*JB.
+ *
+ * ---------------------------------------------------------------------
+ */
+  const int myrow   = PANEL->grid->myrow;
+  const int nprow   = PANEL->grid->nprow;
+  const int jb      = PANEL->jb;
+  const int nb      = PANEL->nb;
+  const int ia      = PANEL->ia;
+  const int iroff   = PANEL->ii;
+  const int icurrow = PANEL->prow;
+
+  int* iwork = IWORK + jb;
+
+  /*
+   * Compute IPLEN
+   */
+  HPL_piplen(PANEL, K, IPID, IPLEN, IWORK);
+
+  /*
+   * Compute the local arrays LINDXU, LINDXA, and LINDXAU containing the
+   * local source and final destination positions resulting from the
+   * application of N interchanges. Compute LINDXU, LINDXA, and LINDXAU
+   * in icurrow, LINDXU elsewhere, and PERMU in every process.
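+   *
+   * As a purely illustrative example: if global row dst = ia + 3 of the
+   * panel ends up in U and its local row index in this process is il,
+   * the loops below record PERMU[ipU] = 3 (its final column inside U),
+   * iwork[ipU] = IPLEN[dstrow] (its current position in the allgathered
+   * U), and LINDXU[IPLEN[dstrow]] = il - iroff (where to find it in A).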
+   */
+  if(myrow == icurrow) {
+    // for all rows to be swapped
+    int ip = 0, ipU = 0;
+    for(int i = 0; i < K; i += 2) {
+      const int src = IPID[i];
+      int       srcrow;
+      Mindxg2p(src, nb, nb, srcrow, 0, nprow);
+
+      if(srcrow == icurrow) { // if I own the src row
+        const int dst = IPID[i + 1];
+        int       dstrow;
+        Mindxg2p(dst, nb, nb, dstrow, 0, nprow);
+
+        int il;
+        Mindxg2l(il, src, nb, nb, myrow, 0, nprow);
+
+        if((dstrow == icurrow) && (dst - ia < jb)) {
+          // if I own the dst and it's in U
+
+          PERMU[ipU] = dst - ia;      // row index in U
+          iwork[ipU] = IPLEN[dstrow]; // Index in AllGathered U
+          ipU++;
+
+          LINDXU[IPLEN[dstrow]] = il - iroff; // Index in AllGathered U
+          IPLEN[dstrow]++;
+        } else if(dstrow != icurrow) {
+          // else if I don't own the dst
+
+          // Find the IPID pair with dst as the source
+          int j = 0;
+          int fndd;
+          do {
+            fndd = (dst == IPID[j]);
+            j += 2;
+          } while(!fndd && (j < K));
+          // This pair must have dst being sent to a position in U
+
+          PERMU[ipU] = IPID[j - 1] - ia; // row index in U
+          iwork[ipU] = IPLEN[dstrow];    // Index in AllGathered U
+          ipU++;
+
+          LINDXU[IPLEN[dstrow]] = il - iroff; // Index in AllGathered U
+          IPLEN[dstrow]++;
+        } else if((dstrow == icurrow) && (dst - ia >= jb)) {
+          // else I own the dst, but it's not in U
+
+          LINDXAU[ip] = il - iroff; // the src row must be in the first jb rows
+
+          int il;
+          Mindxg2l(il, dst, nb, nb, myrow, 0, nprow);
+          LINDXA[ip] = il - iroff; // the dst is somewhere below
+          ip++;
+        }
+      }
+    }
+    *IPA = ip;
+  } else {
+    // for all rows to be swapped
+    int ip = 0, ipU = 0;
+    for(int i = 0; i < K; i += 2) {
+      const int src = IPID[i];
+      int       srcrow;
+      Mindxg2p(src, nb, nb, srcrow, 0, nprow);
+      const int dst = IPID[i + 1];
+      int       dstrow;
+      Mindxg2p(dst, nb, nb, dstrow, 0, nprow);
+      /*
+       * LINDXU[i] is the local index of the row of A that belongs in U
+       */
+      if(myrow == dstrow) { // if I own the dst row
+        int il;
+        Mindxg2l(il, dst, nb, nb, myrow, 0, nprow);
+        LINDXU[ip] = il - iroff; // Local A index of incoming row
+        ip++;
+      }
+      /*
+       * iwork[i] is the local (current) position index in U
+       * PERMU[i] is the local (final) destination index in U
+       */
+
+      // if the src row is coming from the current row rank
+      if(srcrow == icurrow) {
+
+        if((dstrow == icurrow) && (dst - ia < jb)) {
+          // If the row is going into U
+          PERMU[ipU] = dst - ia;      // row index in U
+          iwork[ipU] = IPLEN[dstrow]; // Index in AllGathered U
+          IPLEN[dstrow]++;
+          ipU++;
+        } else if(dstrow != icurrow) {
+          // If the row is going to another rank
+          // (So src must be in U)
+
+          // Find the IPID pair with dst as the source
+          int j = 0;
+          int fndd;
+          do {
+            fndd = (dst == IPID[j]);
+            j += 2;
+          } while(!fndd && (j < K));
+          // This pair must have dst being sent to a position in U
+
+          PERMU[ipU] = IPID[j - 1] - ia; // row index in U
+          iwork[ipU] = IPLEN[dstrow];    // Index in AllGathered U
+          IPLEN[dstrow]++;
+          ipU++;
+        }
+      }
+    }
+    *IPA = 0;
+  }
+  /*
+   * Simplify iwork and PERMU, and return in PERMU the sequence of
+   * permutations that needs to be applied to U after it has been
+   * broadcast.
+   */
+  HPL_perm(jb, iwork, PERMU, IWORK);
+  /*
+   * Reset IPLEN to its correct value
+   */
+  for(int i = nprow; i > 0; i--) IPLEN[i] = IPLEN[i - 1];
+  IPLEN[0] = 0;
+}
diff --git a/src/timer/HPL_ptimer.cpp b/src/timer/HPL_ptimer.cpp
new file mode 100644
index 0000000..53b82ec
--- /dev/null
+++ b/src/timer/HPL_ptimer.cpp
@@ -0,0 +1,262 @@
+/* ---------------------------------------------------------------------
+ * -- High Performance Computing Linpack Benchmark (HPL)
+ *    HPL - 2.2 - February 24, 2016
+ *    Antoine P. Petitet
+ *    University of Tennessee, Knoxville
+ *    Innovative Computing Laboratory
+ *    (C) Copyright 2000-2008 All Rights Reserved
+ *
+ *    Modified by: Noel Chalmers
+ *    (C) 2018-2022 Advanced Micro Devices, Inc.
+ *    See the rocHPL/LICENCE file for details.
+ *
+ *    SPDX-License-Identifier: (BSD-3-Clause)
+ * ---------------------------------------------------------------------
+ */
+
+#include "hpl.hpp"
+/*
+ * ---------------------------------------------------------------------
+ * Static variables
+ * ---------------------------------------------------------------------
+ */
+static int    HPL_ptimer_disabled;
+static double HPL_ptimer_cpusec[HPL_NPTIMER], HPL_ptimer_cpustart[HPL_NPTIMER];
+static double HPL_ptimer_wallsec[HPL_NPTIMER],
+    HPL_ptimer_wallstart[HPL_NPTIMER];
+static double HPL_ptimer_wallstep[HPL_NPTIMER];
+/*
+ * ---------------------------------------------------------------------
+ * User callable functions
+ * ---------------------------------------------------------------------
+ */
+void HPL_ptimer_boot() {
+  /*
+   * HPL_ptimer_boot (re)sets all timers to 0, and enables HPL_ptimer.
+   */
+
+  int i;
+
+  HPL_ptimer_disabled = 0;
+
+  for(i = 0; i < HPL_NPTIMER; i++) {
+    HPL_ptimer_cpusec[i] = HPL_ptimer_wallsec[i] = HPL_rzero;
+    HPL_ptimer_wallstep[i] = HPL_rzero;
+    HPL_ptimer_cpustart[i] = HPL_ptimer_wallstart[i] = HPL_PTIMER_STARTFLAG;
+  }
+}
+
+void HPL_ptimer(const int I) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_ptimer provides "stopwatch" functionality: a cpu/wall timer in
+   * seconds. Up to 64 separate timers can be functioning at once. The
+   * first call starts the timer, and the second stops it. This routine
+   * can be disabled by calling HPL_ptimer_disable(), so that calls to
+   * the timer are ignored. This feature can be used to make sure certain
+   * sections of code do not affect timings, even if they call routines
+   * which have HPL_ptimer calls in them. HPL_ptimer_enable() will enable
+   * the timer functionality. One can retrieve the current value of a
+   * timer by calling
+   *
+   *    t0 = HPL_ptimer_inquire( HPL_WALL_PTIME | HPL_CPU_PTIME, I )
+   *
+   * where I is the timer index in [0..64). To initialize the timer
+   * functionality, one must have called HPL_ptimer_boot() prior to any of
+   * the functions mentioned above.
+   *
+   * Arguments
+   * =========
+   *
+   * I       (global input)                const int
+   *         On entry, I specifies the timer to stop/start.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  if(HPL_ptimer_disabled) return;
+  /*
+   * If timer has not been started, start it. Otherwise, stop it and add
+   * interval to count
+   */
+  if(HPL_ptimer_wallstart[I] == HPL_PTIMER_STARTFLAG) {
+    HPL_ptimer_wallstart[I] = HPL_ptimer_walltime();
+    HPL_ptimer_cpustart[I]  = HPL_ptimer_cputime();
+  } else {
+    HPL_ptimer_cpusec[I] += HPL_ptimer_cputime() - HPL_ptimer_cpustart[I];
+    const double walltime = HPL_ptimer_walltime() - HPL_ptimer_wallstart[I];
+    HPL_ptimer_wallstep[I] += walltime;
+    HPL_ptimer_wallsec[I] += walltime;
+    HPL_ptimer_wallstart[I] = HPL_PTIMER_STARTFLAG;
+  }
+}
+
+void HPL_ptimer_enable(void) {
+  /*
+   * HPL_ptimer_enable sets it so calls to HPL_ptimer are not ignored.
+   */
+
+  HPL_ptimer_disabled = 0;
+  return;
+}
+
+void HPL_ptimer_disable(void) {
+  /*
+   * HPL_ptimer_disable sets it so calls to HPL_ptimer are ignored.
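+   *
+   * A typical (illustrative) use is to exclude a section of code from
+   * the timings, e.g.:
+   *
+   *    HPL_ptimer_disable();
+   *    untimed_setup();     (a hypothetical, untimed routine)
+   *    HPL_ptimer_enable();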
+   */
+
+  HPL_ptimer_disabled = 1;
+  return;
+}
+
+void HPL_ptimer_stepReset(const int N, const int IBEG) {
+  for(int i = 0; i < N; i++) { HPL_ptimer_wallstep[IBEG + i] = HPL_rzero; }
+}
+
+double HPL_ptimer_getStep(const int I) {
+
+  double time;
+
+  /*
+   * If wall-clock time is not available on this machine, return
+   * HPL_PTIMER_ERROR
+   */
+  if(HPL_ptimer_walltime() == HPL_PTIMER_ERROR)
+    time = HPL_PTIMER_ERROR;
+  else
+    time = HPL_ptimer_wallstep[I];
+
+  return (time);
+}
+
+double HPL_ptimer_inquire(const HPL_T_PTIME TMTYPE, const int I) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_ptimer_inquire returns wall- or cpu- time that has accumulated in
+   * timer I.
+   *
+   * Arguments
+   * =========
+   *
+   * TMTYPE  (global input)                const HPL_T_PTIME
+   *         On entry, TMTYPE specifies what time will be returned as fol-
+   *         lows
+   *            = HPL_WALL_PTIME : wall clock time is returned,
+   *            = HPL_CPU_PTIME  : CPU time is returned (default).
+   *
+   * I       (global input)                const int
+   *         On entry, I specifies the timer to return.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  double time;
+
+  /*
+   * If wall- or cpu-time are not available on this machine, return
+   * HPL_PTIMER_ERROR
+   */
+  if(TMTYPE == HPL_WALL_PTIME) {
+    if(HPL_ptimer_walltime() == HPL_PTIMER_ERROR)
+      time = HPL_PTIMER_ERROR;
+    else
+      time = HPL_ptimer_wallsec[I];
+  } else {
+    if(HPL_ptimer_cputime() == HPL_PTIMER_ERROR)
+      time = HPL_PTIMER_ERROR;
+    else
+      time = HPL_ptimer_cpusec[I];
+  }
+  return (time);
+}
+
+void HPL_ptimer_combine(MPI_Comm COMM,
+                        const HPL_T_PTIME_OP OPE,
+                        const HPL_T_PTIME TMTYPE,
+                        const int N,
+                        const int IBEG,
+                        double* TIMES) {
+  /*
+   * Purpose
+   * =======
+   *
+   * HPL_ptimer_combine combines the timing information stored on a scope
+   * of processes into the user TIMES array.
+   *
+   * Arguments
+   * =========
+   *
+   * COMM    (global/local input)          MPI_Comm
+   *         The MPI communicator identifying the process collection on
+   *         which the timings are taken.
+   *
+   * OPE     (global input)                const HPL_T_PTIME_OP
+   *         On entry, OPE specifies what combine operation should be done
+   *         as follows:
+   *            = HPL_AMAX_PTIME get max. time on any process (default),
+   *            = HPL_AMIN_PTIME get min. time on any process,
+   *            = HPL_SUM_PTIME  get sum of times across processes.
+   *
+   * TMTYPE  (global input)                const HPL_T_PTIME
+   *         On entry, TMTYPE specifies what time will be returned as fol-
+   *         lows
+   *            = HPL_WALL_PTIME : wall clock time is returned,
+   *            = HPL_CPU_PTIME  : CPU time is returned (default).
+   *
+   * N       (global input)                const int
+   *         On entry, N specifies the number of timers to combine.
+   *
+   * IBEG    (global input)                const int
+   *         On entry, IBEG specifies the first timer to be combined.
+   *
+   * TIMES   (global output)               double *
+   *         On entry, TIMES is an array of dimension at least N. On exit,
+   *         this array contains the requested timing information.
+   *
+   * ---------------------------------------------------------------------
+   */
+
+  int i, tmpdis;
+
+  tmpdis = HPL_ptimer_disabled;
+  HPL_ptimer_disabled = 1;
+  /*
+   * Timer has been disabled for combine operation - copy timing
+   * information into user times array. If wall- or cpu-time are not
+   * available on this machine, fill in times with HPL_PTIMER_ERROR flag
+   * and return.
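+   *
+   * For example (values illustrative): with OPE = HPL_AMAX_PTIME, N = 2
+   * and IBEG = 0, TIMES[0] and TIMES[1] receive the maximum accumulated
+   * wall (or cpu) time of timers 0 and 1 over all processes of COMM.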
+ */ + if(TMTYPE == HPL_WALL_PTIME) { + if(HPL_ptimer_walltime() == HPL_PTIMER_ERROR) { + for(i = 0; i < N; i++) TIMES[i] = HPL_PTIMER_ERROR; + return; + } else { + for(i = 0; i < N; i++) TIMES[i] = HPL_ptimer_wallsec[IBEG + i]; + } + } else { + if(HPL_ptimer_cputime() == HPL_PTIMER_ERROR) { + for(i = 0; i < N; i++) TIMES[i] = HPL_PTIMER_ERROR; + return; + } else { + for(i = 0; i < N; i++) TIMES[i] = HPL_ptimer_cpusec[IBEG + i]; + } + } + /* + * Combine all nodes information, restore HPL_ptimer_disabled, and return + */ + for(i = 0; i < N; i++) TIMES[i] = Mmax(HPL_rzero, TIMES[i]); + + if(OPE == HPL_AMAX_PTIME) + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_MAX, COMM); + else if(OPE == HPL_AMIN_PTIME) + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_MIN, COMM); + else if(OPE == HPL_SUM_PTIME) + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_SUM, COMM); + else + (void)HPL_all_reduce((void*)(TIMES), N, HPL_DOUBLE, HPL_MAX, COMM); + + HPL_ptimer_disabled = tmpdis; +} diff --git a/src/timer/HPL_ptimer_cputime.cpp b/src/timer/HPL_ptimer_cputime.cpp new file mode 100644 index 0000000..a3f1577 --- /dev/null +++ b/src/timer/HPL_ptimer_cputime.cpp @@ -0,0 +1,45 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +/* + * Purpose + * ======= + * + * HPL_ptimer_cputime returns the cpu time. + * The clock() function is used to return an approximation of processor + * time used by the program. The value returned is the CPU time used so + * far as a clock_t; to get the number of seconds used, the result is + * divided by CLOCKS_PER_SEC. This function is part of the ANSI/ISO C + * standard library. + * + * --------------------------------------------------------------------- + */ + +#include + +double HPL_ptimer_cputime(void) { + static double cps = CLOCKS_PER_SEC; + double d; + clock_t t1; + static clock_t t0 = 0; + + if(t0 == 0) t0 = clock(); + t1 = clock() - t0; + d = (double)(t1) / cps; + return (d); +} diff --git a/src/timer/HPL_ptimer_walltime.cpp b/src/timer/HPL_ptimer_walltime.cpp new file mode 100644 index 0000000..de35681 --- /dev/null +++ b/src/timer/HPL_ptimer_walltime.cpp @@ -0,0 +1,29 @@ +/* --------------------------------------------------------------------- + * -- High Performance Computing Linpack Benchmark (HPL) + * HPL - 2.2 - February 24, 2016 + * Antoine P. Petitet + * University of Tennessee, Knoxville + * Innovative Computing Laboratory + * (C) Copyright 2000-2008 All Rights Reserved + * + * Modified by: Noel Chalmers + * (C) 2018-2022 Advanced Micro Devices, Inc. + * See the rocHPL/LICENCE file for details. + * + * SPDX-License-Identifier: (BSD-3-Clause) + * --------------------------------------------------------------------- + */ + +#include "hpl.hpp" + +/* + * Purpose + * ======= + * + * HPL_ptimer_walltime returns the elapsed (wall-clock) time. + * + * + * --------------------------------------------------------------------- + */ + +double HPL_ptimer_walltime(void) { return (MPI_Wtime()); }
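
For reference: the toggle-and-accumulate convention that HPL_ptimer implements above (a first call on index I starts the stopwatch, the second call stops it and adds the elapsed interval to the running total) can be illustrated with the following minimal standalone sketch. It is not part of the patch, and the names toy_timer, wallsec, and START_FLAG are illustrative only.

    #include <mpi.h>
    #include <cstdio>

    // One timer in the HPL_ptimer style: a sentinel start value marks a
    // stopped timer; the first call records the start time, the second
    // call accumulates the elapsed interval and re-arms the sentinel.
    static const double START_FLAG = -1.0;
    static double wallsec   = 0.0;
    static double wallstart = START_FLAG;

    void toy_timer(void) {
      if(wallstart == START_FLAG) {
        wallstart = MPI_Wtime();            // first call: start
      } else {
        wallsec += MPI_Wtime() - wallstart; // second call: stop and accumulate
        wallstart = START_FLAG;
      }
    }

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      toy_timer(); // start
      /* ... timed section ... */
      toy_timer(); // stop
      std::printf("accumulated wall time: %f s\n", wallsec);
      MPI_Finalize();
      return 0;
    }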