From f379dfea164d36b423d5dc4781580dd2bc184027 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 24 Jan 2025 12:10:10 +0000 Subject: [PATCH 01/10] Remove unnecessary CUDAFOR imports --- src/core/dev_alloc_module.fypp | 10 +--------- src/core/field_RANKSUFF_module.fypp | 3 --- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/src/core/dev_alloc_module.fypp b/src/core/dev_alloc_module.fypp index 22c9e93..c86bda0 100644 --- a/src/core/dev_alloc_module.fypp +++ b/src/core/dev_alloc_module.fypp @@ -13,15 +13,7 @@ MODULE DEV_ALLOC_MODULE ${fieldType.useParkind1 ()}$ -#:if defined('USE_BUDDY_MALLOC') or defined('CUDA') -USE ISO_C_BINDING -#:endif -#ifdef _OPENACC -USE OPENACC -#endif -#:if defined('CUDA') -USE CUDAFOR -#:endif +USE, INTRINSIC :: ISO_C_BINDING IMPLICIT NONE diff --git a/src/core/field_RANKSUFF_module.fypp b/src/core/field_RANKSUFF_module.fypp index 0a7c151..3644404 100644 --- a/src/core/field_RANKSUFF_module.fypp +++ b/src/core/field_RANKSUFF_module.fypp @@ -19,9 +19,6 @@ USE HOST_ALLOC_MODULE USE FIELD_BASIC_MODULE USE FIELD_CONSTANTS_MODULE USE FIELD_DEFAULTS_MODULE -#:if defined('CUDA') -USE CUDAFOR -#:endif #ifdef _OPENACC USE OPENACC #endif From e64e194d988faaff55cd07e90b7dabc8f06bc56c Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 24 Jan 2025 14:18:48 +0000 Subject: [PATCH 02/10] Remove CUDAFOR from HOST_ALLOC_MODULE --- src/core/host_alloc_module.fypp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/core/host_alloc_module.fypp b/src/core/host_alloc_module.fypp index f5ef5d6..13fee5d 100644 --- a/src/core/host_alloc_module.fypp +++ b/src/core/host_alloc_module.fypp @@ -12,10 +12,7 @@ MODULE HOST_ALLOC_MODULE #:set fieldTypeList = fieldType.getFieldTypeList () ${fieldType.useParkind1 ()}$ -USE ISO_C_BINDING -#:if defined('CUDA') -USE CUDAFOR -#:endif +USE, INTRINSIC::ISO_C_BINDING USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY : INT64 USE FIELD_DEFAULTS_MODULE USE FIELD_STATISTICS_MODULE 
@@ -67,6 +64,21 @@ INTERFACE END SUBROUTINE C_PTR_INCR END INTERFACE +#:if defined('CUDA') +INTERFACE + INTEGER FUNCTION CUDA_HOST_REGISTER (PTR, SIZ, FLAGS) BIND (C, NAME='cudaHostRegister') + IMPORT :: C_PTR, C_SIZE_T, C_INT + TYPE (C_PTR), VALUE, INTENT(IN) :: PTR + INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: SIZ + INTEGER (C_INT), VALUE, INTENT(IN) :: FLAGS + END FUNCTION CUDA_HOST_REGISTER + INTEGER FUNCTION CUDA_HOST_UNREGISTER (PTR) BIND (C, NAME='cudaHostUnregister') + IMPORT :: C_PTR + TYPE (C_PTR), VALUE, INTENT(IN) :: PTR + END FUNCTION CUDA_HOST_UNREGISTER +END INTERFACE +#:endif + TYPE :: MEM_BLOCK TYPE(C_PTR) :: DATA = C_NULL_PTR INTEGER(KIND=INT64) :: POS = 0 @@ -259,7 +271,10 @@ SUBROUTINE PIN_ALLOCATION(DATA, ARR_SIZE, ISTAT) INTEGER, INTENT(OUT) :: ISTAT INTEGER(C_SIZE_T), INTENT(IN) :: ARR_SIZE - ISTAT = CUDAHOSTREGISTER (DATA, ARR_SIZE, CUDAHOSTREGISTERMAPPED) + INTEGER(C_INT) :: FLAGS + + FLAGS = 2 !... Corresponds to cudaHostRegisterMapped + ISTAT = CUDA_HOST_REGISTER (DATA, ARR_SIZE, FLAGS) END SUBROUTINE PIN_ALLOCATION @@ -268,7 +283,7 @@ SUBROUTINE UNPIN_ALLOCATION(DATA, ISTAT) TYPE(C_PTR), INTENT(INOUT) :: DATA INTEGER, INTENT(OUT) :: ISTAT - ISTAT = CUDAHOSTUNREGISTER (DATA) + ISTAT = CUDA_HOST_UNREGISTER (DATA) END SUBROUTINE UNPIN_ALLOCATION #:endif From 96240dcfebbb79dc07cb8b493cf2db4895c57f51 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Tue, 28 Jan 2025 15:48:11 +0000 Subject: [PATCH 03/10] FYPP: fypp now only supported if installed as a pip package --- cmake/field_api_fetchcontent_fypp.cmake | 20 -------------------- cmake/field_api_find_fypp.cmake | 19 +++++++++---------- 2 files changed, 9 insertions(+), 30 deletions(-) delete mode 100644 cmake/field_api_fetchcontent_fypp.cmake diff --git a/cmake/field_api_fetchcontent_fypp.cmake b/cmake/field_api_fetchcontent_fypp.cmake deleted file mode 100644 index ab7158d..0000000 --- a/cmake/field_api_fetchcontent_fypp.cmake +++ /dev/null @@ -1,20 +0,0 @@ -# (C) Copyright 2022- ECMWF. 
-# (C) Copyright 2022- Meteo-France. -# -# This software is licensed under the terms of the Apache Licence Version 2.0 -# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -# In applying this licence, ECMWF does not waive the privileges and immunities -# granted to it by virtue of its status as an intergovernmental organisation -# nor does it submit to any jurisdiction. - -# Download fypp preprocessor if not found. - -include(FetchContent) - -FetchContent_Declare( - fypp - GIT_REPOSITORY https://github.com/aradi/fypp - GIT_TAG 3.1 -) - -FetchContent_MakeAvailable(fypp) diff --git a/cmake/field_api_find_fypp.cmake b/cmake/field_api_find_fypp.cmake index 8b4cbaa..5233794 100644 --- a/cmake/field_api_find_fypp.cmake +++ b/cmake/field_api_find_fypp.cmake @@ -26,16 +26,15 @@ macro( field_api_find_fypp ) if( fckit_FOUND AND fckit_HAVE_FCKIT_VENV ) set( FYPP ${FCKIT_VENV_EXE} -m fypp ) - elseif( fckit_FOUND ) - # This is only needed for building in environments with python3 older than 3.8 - list( APPEND _fckit_fypp_path "${FYPP}" ) - list( LENGTH _fckit_fypp_path _list_length ) - MATH( EXPR _last_entry "${_list_length} - 1" ) - list( GET _fckit_fypp_path ${_last_entry} FYPP ) - elseif( FYPP MATCHES FYPP-NOTFOUND ) - include(cmake/field_api_fetchcontent_fypp.cmake) - set(FYPP ${fypp_SOURCE_DIR}/bin/fypp) - ecbuild_info("fypp downloaded to: ${FYPP}") + elseif( FYPP MATCHES FYPP-NOTFOUND OR (fckit_FOUND AND (NOT fckit_HAVE_FCKIT_VENV OR NOT DEFINED fckit_HAVE_FCKIT_VENV)) ) + # Discover only the system-installed Python 3; the interpreter is mandatory from here on + set( Python3_FIND_VIRTUALENV STANDARD ) + find_package( Python3 REQUIRED COMPONENTS Interpreter ) + + execute_process( COMMAND ${Python3_EXECUTABLE} -m ensurepip --upgrade OUTPUT_QUIET ) + execute_process( COMMAND ${Python3_EXECUTABLE} -m pip --disable-pip-version-check install fypp OUTPUT_QUIET ) + ecbuild_info("field_api installed fypp as a pip package") + set( FYPP ${Python3_EXECUTABLE} -m fypp ) endif() endmacro() From 
b1b714f4fbdf97442d98089e9f1a0f65c2203f58 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Mon, 27 Jan 2025 20:37:02 +0000 Subject: [PATCH 04/10] WIP: implement macro based offload instructions for Nvidia OpenACC in FIELD_RANKSUFF_DATA_MODULE --- .gitignore | 1 + CMakeLists.txt | 27 +-- cmake/field_api_expand_fypp_ranksuff.cmake | 2 + cmake/field_api_get_offload_model.cmake | 57 ++++++ cmake/field_api_macros.cmake | 1 + python_utils/offload_backends/__init__.py | 10 ++ .../offload_backends/nvhpc/__init__.py | 10 ++ .../offload_backends/nvhpc/openacc.py | 91 ++++++++++ python_utils/offload_macros.py | 167 ++++++++++++++++++ src/core/field_RANKSUFF_data_module.fypp | 20 +-- 10 files changed, 351 insertions(+), 35 deletions(-) create mode 100644 cmake/field_api_get_offload_model.cmake create mode 100644 python_utils/offload_backends/__init__.py create mode 100644 python_utils/offload_backends/nvhpc/__init__.py create mode 100644 python_utils/offload_backends/nvhpc/openacc.py create mode 100644 python_utils/offload_macros.py diff --git a/.gitignore b/.gitignore index e76fd93..030739f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ field_module.F90 *.swp build +**__pycache__ diff --git a/CMakeLists.txt b/CMakeLists.txt index 133eb24..f15de9d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,24 +33,8 @@ ecbuild_find_package(OpenMP COMPONENTS Fortran REQUIRED) ## find fypp field_api_find_fypp() -## find OpenACC -if( ${CMAKE_VERSION} VERSION_LESS "3.25" ) - if ( FIELD_API_ENABLE_ACC OR (NOT DEFINED FIELD_API_ENABLE_ACC AND (ENABLE_ACC OR NOT DEFINED ENABLE_ACC)) ) - # See https://gitlab.kitware.com/cmake/cmake/-/issues/23691, fixed in CMake 3.25 - # (TL;DR: FindOpenACC sets OpenACC__FOUND correctly but does not set - # OpenACC_FOUND unless all three C, CXX, and Fortran have been found - even if - # only one language has been requested via COMPONENTS) - find_package( OpenACC COMPONENTS Fortran ) - if( OpenACC_Fortran_FOUND ) - set( OpenACC_FOUND ON ) - endif() 
- endif() -endif() -ecbuild_add_option( FEATURE ACC - DEFAULT ON - DESCRIPTION "Support for using GPUs with OpenACC" - REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran" - CONDITION CMAKE_Fortran_COMPILER_ID MATCHES "PGI|NVHPC") +## determine GPU offload model +field_api_get_offload_model() ## set general compiler flags field_api_compile_options() @@ -75,13 +59,6 @@ endif() ## find fiat field_api_find_fiat_modules() -## check for CUDA -include(CheckLanguage) -check_language(CUDA) -ecbuild_add_option( FEATURE CUDA - DESCRIPTION "CUDA" DEFAULT ON - CONDITION CMAKE_CUDA_COMPILER AND HAVE_ACC ) - ## buddy allocator option ecbuild_add_option( FEATURE BUDDY_MALLOC DESCRIPTION "Use buddy allocator for shadow host allocation" diff --git a/cmake/field_api_expand_fypp_ranksuff.cmake b/cmake/field_api_expand_fypp_ranksuff.cmake index 77fc175..d811f30 100644 --- a/cmake/field_api_expand_fypp_ranksuff.cmake +++ b/cmake/field_api_expand_fypp_ranksuff.cmake @@ -51,6 +51,7 @@ macro( field_api_expand_fypp_ranksuff ) add_custom_command (OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/field_${RANK}${suff}${FUNC}_module.F90 COMMAND ${FYPP} -DRANK=${RANK} -DSUFF='${SUFF}' ${fypp_defines} -m os -M ${_PAR_PYTHON_MODULE_DIR} -m fieldType + -DOFFLOAD_MODEL="${FIELD_API_OFFLOAD_MODEL}" -M ${_PAR_PYTHON_MODULE_DIR} -m offload_macros ${_PAR_SOURCE_DIR}/field_RANKSUFF${FUNC}_module.fypp > ${CMAKE_CURRENT_BINARY_DIR}/field_${RANK}${suff}${FUNC}_module.F90 DEPENDS ${_PAR_SOURCE_DIR}/field_RANKSUFF${FUNC}_module.fypp VERBATIM) @@ -62,6 +63,7 @@ macro( field_api_expand_fypp_ranksuff ) add_custom_command (OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/field_${RANK}${suff}${FUNC}_module.F90 COMMAND ${FYPP} -DRANK=${RANK} -DSUFF='${SUFF}' ${fypp_defines} -m os -M ${_PAR_PYTHON_MODULE_DIR} -m fieldType + -DOFFLOAD_MODEL="${FIELD_API_OFFLOAD_MODEL}" -M ${_PAR_PYTHON_MODULE_DIR} -m offload_macros ${_PAR_SOURCE_DIR}/field_RANKSUFF${FUNC}_module.fypp > ${CMAKE_CURRENT_BINARY_DIR}/field_${RANK}${suff}${FUNC}_module.F90 DEPENDS 
${_PAR_SOURCE_DIR}/field_RANKSUFF${FUNC}_module.fypp VERBATIM) diff --git a/cmake/field_api_get_offload_model.cmake b/cmake/field_api_get_offload_model.cmake new file mode 100644 index 0000000..62a4c65 --- /dev/null +++ b/cmake/field_api_get_offload_model.cmake @@ -0,0 +1,57 @@ +# (C) Copyright 2022- ECMWF. +# (C) Copyright 2022- Meteo-France. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +############################################################################## +#.rst: +# +# field_api_get_offload_model +# =========================== +# +# Determine the GPU offload model to be used. :: +# +# field_api_get_offload_model() +# +############################################################################## + +macro( field_api_get_offload_model ) + + ## find OpenACC + if( ${CMAKE_VERSION} VERSION_LESS "3.25" ) + if ( FIELD_API_ENABLE_ACC OR (NOT DEFINED FIELD_API_ENABLE_ACC AND (ENABLE_ACC OR NOT DEFINED ENABLE_ACC)) ) + # See https://gitlab.kitware.com/cmake/cmake/-/issues/23691, fixed in CMake 3.25 + # (TL;DR: FindOpenACC sets OpenACC__FOUND correctly but does not set + # OpenACC_FOUND unless all three C, CXX, and Fortran have been found - even if + # only one language has been requested via COMPONENTS) + find_package( OpenACC COMPONENTS Fortran ) + if( OpenACC_Fortran_FOUND ) + set( OpenACC_FOUND ON ) + endif() + endif() + endif() + ecbuild_add_option( FEATURE ACC + DEFAULT ON + DESCRIPTION "Support for using GPUs with OpenACC" + REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran" + CONDITION CMAKE_Fortran_COMPILER_ID MATCHES "PGI|NVHPC") + + ## check for CUDA + include(CheckLanguage) + check_language(CUDA) + ecbuild_add_option( FEATURE CUDA + DESCRIPTION "CUDA" 
DEFAULT ON + CONDITION CMAKE_CUDA_COMPILER AND HAVE_ACC ) + + set(FIELD_API_OFFLOAD_MODEL "None") + if( HAVE_CUDA ) + set(FIELD_API_OFFLOAD_MODEL "NVHPCOpenACCCUDA") + elseif( HAVE_ACC ) + set(FIELD_API_OFFLOAD_MODEL "NVHPCOpenACC") + endif() + +endmacro() diff --git a/cmake/field_api_macros.cmake b/cmake/field_api_macros.cmake index 711a61a..e4837fe 100644 --- a/cmake/field_api_macros.cmake +++ b/cmake/field_api_macros.cmake @@ -14,3 +14,4 @@ include( field_api_expand_fypp ) include( field_api_expand_fypp_ranksuff ) include( field_api_add_object_library ) include( field_api_target_add_module_dirs ) +include( field_api_get_offload_model ) diff --git a/python_utils/offload_backends/__init__.py b/python_utils/offload_backends/__init__.py new file mode 100644 index 0000000..bdb1034 --- /dev/null +++ b/python_utils/offload_backends/__init__.py @@ -0,0 +1,10 @@ +# (C) Copyright 2022- ECMWF. +# (C) Copyright 2022- Meteo-France. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +from offload_backends.nvhpc import * diff --git a/python_utils/offload_backends/nvhpc/__init__.py b/python_utils/offload_backends/nvhpc/__init__.py new file mode 100644 index 0000000..943bffd --- /dev/null +++ b/python_utils/offload_backends/nvhpc/__init__.py @@ -0,0 +1,10 @@ +# (C) Copyright 2022- ECMWF. +# (C) Copyright 2022- Meteo-France. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. 
+ +from offload_backends.nvhpc.openacc import * diff --git a/python_utils/offload_backends/nvhpc/openacc.py b/python_utils/offload_backends/nvhpc/openacc.py new file mode 100644 index 0000000..b129ed7 --- /dev/null +++ b/python_utils/offload_backends/nvhpc/openacc.py @@ -0,0 +1,91 @@ +# (C) Copyright 2022- ECMWF. +# (C) Copyright 2022- Meteo-France. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + + +__all__ = ['NVHPCOpenACC'] + +class NVHPCOpenACC(): + """ + A class that defines the macros needed for GPU offload using Nvidia's + OpenACC implementation. + """ + + pragma = '!$acc' + + @classmethod + def runtime_api_import(cls): + """ + Runtime API import. + """ + + return "USE OPENACC" + + @classmethod + def c_devptr_declaration(cls, symbols): + """ + Type declaration for a `C_PTR` on device. + """ + + return f"TYPE(C_DEVPTR) :: {','.join(symbols)}" + + @classmethod + def host_data_start(cls, symbols): + """ + Pragma to mark the start of a `host_data` region. + """ + + return f"!$acc host_data use_device({','.join(symbols)})" + + @classmethod + def host_data_end(cls): + """ + Pragma to mark the end of a `host_data` region. + """ + + return "!$acc end host_data" + + @classmethod + def devptr_c_loc(cls, symbol): + """ + Function to determine the C address of a device variable. + """ + + return f"C_DEVLOC({symbol})" + + @classmethod + def copy_to_device_1D(cls, dev, host, size): + """ + Copy a contiguous section of data from host to device. + """ + + return f"CALL ACC_MEMCPY_TO_DEVICE ({dev}, {host}, {size})" + + @classmethod + def copy_to_device_1D_async(cls, dev, host, size, queue): + """ + Asynchronously copy a contiguous section of data from host to device. 
+ """ + + return f"CALL ACC_MEMCPY_TO_DEVICE_ASYNC ({dev}, {host}, {size}, {queue})" + + @classmethod + def copy_from_device_1D(cls, dev, host, size): + """ + Copy a contiguous section of data from device to host. + """ + + return f"CALL ACC_MEMCPY_FROM_DEVICE ({host}, {dev}, {size})" + + @classmethod + def copy_from_device_1D_async(cls, dev, host, size, queue): + """ + Asynchronously copy a contiguous section of data from device to host. + """ + + return f"CALL ACC_MEMCPY_FROM_DEVICE_ASYNC ({host}, {dev}, {size}, {queue})" diff --git a/python_utils/offload_macros.py b/python_utils/offload_macros.py new file mode 100644 index 0000000..3de9888 --- /dev/null +++ b/python_utils/offload_macros.py @@ -0,0 +1,167 @@ +# (C) Copyright 2022- ECMWF. +# (C) Copyright 2022- Meteo-France. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + +import fypp +from offload_backends import NVHPCOpenACC + +""" +A common entry point for retrieving macros from the various GPU offload backends. +""" + +_offload_map = { + 'NVHPCOpenACC': NVHPCOpenACC +} + +def _wrap_lines(input_str, ref_len, pragma='', indent=0): + """ + Wrap a long line. + + Parameters + ---------- + input_str : str + The long line to wrap. + ref_len : int + The maximum permissible line length. + pragma : str + The pragma keyword to start new lines with. + indent : int + The length of the indent to place at the start of each line. 
+ """ + + pieces = input_str.split(' ') + + count = 0 + lines = [' ' * indent,] + for piece in pieces: + if len(lines[count]) + len(piece) > ref_len: + lines[count] += ' &' + lines += [' ' * indent + pragma + ' &',] + count += 1 + + lines[count] += (piece + ' ') + + return lines + + +def _format_lines(input_str, indent=0, width=132, pragma=''): + """ + Add a specified indent to an input string and wrap it across multiple lines if it + exceeds the specified width. + + Parameters + ---------- + input_str : str + The input string to format. + indent : int + The size of the indent to apply before the line. + width : int + The maximum allowed length of a line. + pragma : str + A pragma keyword to prepend to a new line. + """ + + _wrapped_lines = [] + if isinstance(input_str, (list, tuple)): + for s in input_str: + _wrapped_lines += [_format_lines(s, indent=indent, width=width, pragma=pragma),] + else: + ref_len = width - indent - 2 - len(pragma) + if len(input_str) > width - indent: + _wrapped_lines += _wrap_lines(input_str, ref_len, indent=indent, pragma=pragma) + else: + _wrapped_lines += [' ' * indent + input_str,] + + return '\n'.join(_wrapped_lines) + +def _get_offload_backend(): + """ + Determine the specific offload backend to be used. + """ + + optparser = fypp.get_option_parser() + options, _ = optparser.parse_args() + + offload_model = [opt for opt in options.defines if 'OFFLOAD_MODEL' in opt][0] + offload_model = offload_model.split('=')[-1].replace('"', '') + + return _offload_map[offload_model] + +def RuntimeApiImport(indent=0): + """ + Import the runtime API. + """ + + backend = _get_offload_backend() + + return _format_lines(backend.runtime_api_import()) + +def CDevptrDecl(symbols, indent=0): + """ + Declare symbols of type `TYPE(C_DEVPTR)` (or equivalent). + """ + + backend = _get_offload_backend() + + return _format_lines(backend.c_devptr_declaration(symbols)) + +def HostDataStart(symbols, indent=0): + """ + Start a `host_data` (or equivalent) region. 
+ """ + + backend = _get_offload_backend() + return _format_lines(backend.host_data_start(symbols), indent=indent, pragma=backend.pragma) + +def HostDataEnd(indent=0): + """ + End a `host_data` (or equivalent) region. + """ + + backend = _get_offload_backend() + return _format_lines(backend.host_data_end(), indent=indent) + +def DevptrCLOC(symbol, indent=0): + """ + Get the C address of a device variable. + """ + + backend = _get_offload_backend() + return _format_lines(backend.devptr_c_loc(symbol)) + +def CopyToDevice1D(dev, host, size, indent=0): + """ + Copy a contiguous section of data from host to device. + """ + + backend = _get_offload_backend() + return _format_lines(backend.copy_to_device_1D(dev, host, size), indent=indent) + +def CopyToDevice1DAsync(dev, host, size, queue, indent=0): + """ + Asynchronously copy a contiguous section of data from host to device. + """ + + backend = _get_offload_backend() + return _format_lines(backend.copy_to_device_1D_async(dev, host, size, queue), indent=indent) + +def CopyFromDevice1D(dev, host, size, indent=0): + """ + Copy a contiguous section of data from device to host. + """ + + backend = _get_offload_backend() + return _format_lines(backend.copy_from_device_1D(dev, host, size), indent=indent) + +def CopyFromDevice1DAsync(dev, host, size, queue, indent=0): + """ + Asynchronously copy a contiguous section of data from device to host. 
+ """ + + backend = _get_offload_backend() + return _format_lines(backend.copy_from_device_1D_async(dev, host, size, queue), indent=indent) diff --git a/src/core/field_RANKSUFF_data_module.fypp b/src/core/field_RANKSUFF_data_module.fypp index 6bc409c..f6d95f9 100644 --- a/src/core/field_RANKSUFF_data_module.fypp +++ b/src/core/field_RANKSUFF_data_module.fypp @@ -112,7 +112,7 @@ CONTAINS #:for d in range (0, ft.rank+1) SUBROUTINE ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) #ifdef _OPENACC - USE OPENACC +$:offload_macros.RuntimeApiImport(indent=2) #endif USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY : INT64 ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) @@ -122,21 +122,21 @@ CONTAINS INTEGER (KIND=INT64) :: ISIZE INTEGER :: ${', '.join (['J'] + list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ #ifdef _OPENACC - TYPE(C_DEVPTR) :: DEVPTR +$:offload_macros.CDevptrDecl(symbols=['DEVPTR'], indent=4) #endif #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) #:endfor #:set ar = ', '.join ([':'] * d + list (map (lambda i: 'J' + str (i+1), range (d, ft.rank)))) - #:set lbdiff = lambda i: f'LBOUND(DEV, {i}) - LBOUND (HST, {i})' + #:set lbdiff = lambda i: f'LBOUND(DEV,{i}) - LBOUND (HST,{i})' #:set ard = ', '.join ([':'] * d + ['J' + str(i+1) + ' + ' + lbdiff(i+1) for i in range (d, ft.rank)]) #:set indent = ' ' * (ft.rank - e) #ifdef _OPENACC ${indent}$ IF(MAP_DEVPTR)THEN - ${indent}$ !$acc host_data use_device(DEV) - ${indent}$ DEVPTR = C_DEVLOC(DEV (${ard}$)) - ${indent}$ !$acc end host_data +$:offload_macros.HostDataStart(symbols=[f'DEV ({ard})'], indent=ft.rank - e) + ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(f'DEV ({ard})')}$ +$:offload_macros.HostDataEnd(indent=ft.rank - e) ${indent}$ ELSE ${indent}$ !$acc data deviceptr(DEVPTR, DEV) ${indent}$ DEVPTR = C_DEVLOC(DEV (${ard}$)) @@ -151,9 +151,9 @@ CONTAINS ${indent}$ IF (KDIR == NH2D) THEN #ifdef _OPENACC 
${indent}$ IF(PRESENT(QUEUE))THEN - ${indent}$ CALL ACC_MEMCPY_TO_DEVICE_ASYNC (DEVPTR , HST (${ar}$), ISIZE, QUEUE) +$:offload_macros.CopyToDevice1DAsync(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) ${indent}$ ELSE - ${indent}$ CALL ACC_MEMCPY_TO_DEVICE (DEVPTR , HST (${ar}$), ISIZE) +$:offload_macros.CopyToDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', indent=ft.rank - e + 10) ${indent}$ ENDIF #else ${indent}$ DEV (${ard}$) = HST (${ar}$) @@ -161,9 +161,9 @@ CONTAINS ${indent}$ ELSEIF (KDIR == ND2H) THEN #ifdef _OPENACC ${indent}$ IF(PRESENT(QUEUE))THEN - ${indent}$ CALL ACC_MEMCPY_FROM_DEVICE_ASYNC (HST (${ar}$), DEVPTR, ISIZE, QUEUE) +$:offload_macros.CopyFromDevice1DAsync(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) ${indent}$ ELSE - ${indent}$ CALL ACC_MEMCPY_FROM_DEVICE (HST (${ar}$), DEVPTR, ISIZE) +$:offload_macros.CopyFromDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', indent=ft.rank - e + 10) ${indent}$ ENDIF #else ${indent}$ HST (${ar}$) = DEV (${ard}$) From f49858f668a2542972839c787458fca2d5176017 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Tue, 28 Jan 2025 18:46:16 +0000 Subject: [PATCH 05/10] WIP: python macros adapted for host only functionality --- cmake/field_api_get_offload_model.cmake | 2 +- python_utils/offload_backends/__init__.py | 1 + python_utils/offload_backends/host_only.py | 18 ++++++++ python_utils/offload_macros.py | 53 +++++++++++++++++----- 4 files changed, 62 insertions(+), 12 deletions(-) create mode 100644 python_utils/offload_backends/host_only.py diff --git a/cmake/field_api_get_offload_model.cmake b/cmake/field_api_get_offload_model.cmake index 62a4c65..afaca7f 100644 --- a/cmake/field_api_get_offload_model.cmake +++ b/cmake/field_api_get_offload_model.cmake @@ -47,7 +47,7 @@ macro( field_api_get_offload_model ) DESCRIPTION "CUDA" DEFAULT ON CONDITION CMAKE_CUDA_COMPILER AND HAVE_ACC ) - set(FIELD_API_OFFLOAD_MODEL 
"None") + set(FIELD_API_OFFLOAD_MODEL "HostOnly") if( HAVE_CUDA ) set(FIELD_API_OFFLOAD_MODEL "NVHPCOpenACCCUDA") elseif( HAVE_ACC ) diff --git a/python_utils/offload_backends/__init__.py b/python_utils/offload_backends/__init__.py index bdb1034..6177551 100644 --- a/python_utils/offload_backends/__init__.py +++ b/python_utils/offload_backends/__init__.py @@ -8,3 +8,4 @@ # nor does it submit to any jurisdiction. from offload_backends.nvhpc import * +from offload_backends.host_only import * diff --git a/python_utils/offload_backends/host_only.py b/python_utils/offload_backends/host_only.py new file mode 100644 index 0000000..51d003b --- /dev/null +++ b/python_utils/offload_backends/host_only.py @@ -0,0 +1,18 @@ +# (C) Copyright 2022- ECMWF. +# (C) Copyright 2022- Meteo-France. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + + +__all__ = ['HostOnly'] + +class HostOnly(): + """ + A dummy class only to be used if GPU offload is disabled. + """ + + pragma = '' diff --git a/python_utils/offload_macros.py b/python_utils/offload_macros.py index 3de9888..9fc310d 100644 --- a/python_utils/offload_macros.py +++ b/python_utils/offload_macros.py @@ -8,14 +8,15 @@ # nor does it submit to any jurisdiction. import fypp -from offload_backends import NVHPCOpenACC +from offload_backends import NVHPCOpenACC, HostOnly """ A common entry point for retrieving macros from the various GPU offload backends. 
""" _offload_map = { - 'NVHPCOpenACC': NVHPCOpenACC + 'NVHPCOpenACC': NVHPCOpenACC, + 'HostOnly': HostOnly } def _wrap_lines(input_str, ref_len, pragma='', indent=0): @@ -92,14 +93,29 @@ def _get_offload_backend(): return _offload_map[offload_model] +def _empty_string(*args, **kwargs): + """Simple method to return an empty string.""" + return "" + +def _get_method(backend, method): + """ + Retrieve the appropriate method from the given backend. + """ + + try: + return getattr(backend, method) + except AttributeError: + return _empty_string + def RuntimeApiImport(indent=0): """ Import the runtime API. """ backend = _get_offload_backend() + method = _get_method(backend, 'runtime_api_import') - return _format_lines(backend.runtime_api_import()) + return _format_lines(method()) def CDevptrDecl(symbols, indent=0): """ @@ -107,8 +123,9 @@ def CDevptrDecl(symbols, indent=0): """ backend = _get_offload_backend() + method = _get_method(backend, 'c_devptr_declaration') - return _format_lines(backend.c_devptr_declaration(symbols)) + return _format_lines(method(symbols)) def HostDataStart(symbols, indent=0): """ @@ -116,7 +133,9 @@ def HostDataStart(symbols, indent=0): """ backend = _get_offload_backend() - return _format_lines(backend.host_data_start(symbols), indent=indent, pragma=backend.pragma) + method = _get_method(backend, 'host_data_start') + + return _format_lines(method(symbols), indent=indent, pragma=backend.pragma) def HostDataEnd(indent=0): """ @@ -124,7 +143,9 @@ def HostDataEnd(indent=0): """ backend = _get_offload_backend() - return _format_lines(backend.host_data_end(), indent=indent) + method = _get_method(backend, 'host_data_end') + + return _format_lines(method(), indent=indent) def DevptrCLOC(symbol, indent=0): """ @@ -132,7 +153,9 @@ def DevptrCLOC(symbol, indent=0): """ backend = _get_offload_backend() - return _format_lines(backend.devptr_c_loc(symbol)) + method = _get_method(backend, 'devptr_c_loc') + + return _format_lines(method(symbol)) def 
CopyToDevice1D(dev, host, size, indent=0): """ @@ -140,7 +163,9 @@ def CopyToDevice1D(dev, host, size, indent=0): """ backend = _get_offload_backend() - return _format_lines(backend.copy_to_device_1D(dev, host, size), indent=indent) + method = _get_method(backend, 'copy_to_device_1D') + + return _format_lines(method(dev, host, size), indent=indent) def CopyToDevice1DAsync(dev, host, size, queue, indent=0): """ @@ -148,7 +173,9 @@ def CopyToDevice1DAsync(dev, host, size, queue, indent=0): """ backend = _get_offload_backend() - return _format_lines(backend.copy_to_device_1D_async(dev, host, size, queue), indent=indent) + method = _get_method(backend, 'copy_to_device_1D_async') + + return _format_lines(method(dev, host, size, queue), indent=indent) def CopyFromDevice1D(dev, host, size, indent=0): """ @@ -156,7 +183,9 @@ def CopyFromDevice1D(dev, host, size, indent=0): """ backend = _get_offload_backend() - return _format_lines(backend.copy_from_device_1D(dev, host, size), indent=indent) + method = _get_method(backend, 'copy_from_device_1D') + + return _format_lines(method(dev, host, size), indent=indent) def CopyFromDevice1DAsync(dev, host, size, queue, indent=0): """ @@ -164,4 +193,6 @@ def CopyFromDevice1DAsync(dev, host, size, queue, indent=0): """ backend = _get_offload_backend() - return _format_lines(backend.copy_from_device_1D_async(dev, host, size, queue), indent=indent) + method = _get_method(backend, 'copy_from_device_1D_async') + + return _format_lines(method(dev, host, size, queue), indent=indent) From 249974ed423a9abbe744b9619545a647fe02335d Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Wed, 29 Jan 2025 16:18:55 +0000 Subject: [PATCH 06/10] FIELD_ASYNC: convert to fypp file --- src/core/CMakeLists.txt | 3 +-- src/core/{field_async_module.F90 => field_async_module.fypp} | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) rename src/core/{field_async_module.F90 => field_async_module.fypp} (93%) diff --git a/src/core/CMakeLists.txt 
b/src/core/CMakeLists.txt index d2243fa..9d4e3fb 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -8,7 +8,6 @@ # nor does it submit to any jurisdiction. list(APPEND srcs - field_async_module.F90 field_basic_module.F90 field_defaults_module.F90 dev_alloc.c @@ -17,7 +16,7 @@ list(APPEND srcs field_abort_module.F90) list( APPEND ranksuff_srcs _data) -list( APPEND non_ranksuff_srcs dev_alloc_module field_module host_alloc_module ) +list( APPEND non_ranksuff_srcs dev_alloc_module field_module host_alloc_module field_async_module ) ## expand ranksuff sources field_api_expand_fypp_ranksuff( diff --git a/src/core/field_async_module.F90 b/src/core/field_async_module.fypp similarity index 93% rename from src/core/field_async_module.F90 rename to src/core/field_async_module.fypp index a7bf2ca..4b8b365 100644 --- a/src/core/field_async_module.F90 +++ b/src/core/field_async_module.fypp @@ -19,7 +19,7 @@ SUBROUTINE WAIT_FOR_ASYNC_QUEUE (QUEUE) INTEGER(KIND=JPIM), INTENT(IN) :: QUEUE !Wait for all data transfer initiated on queue by the current thread -!$acc wait (QUEUE) +$:offload_macros.WaitAsyncStream(stream='QUEUE') END SUBROUTINE WAIT_FOR_ASYNC_QUEUE END MODULE FIELD_ASYNC_MODULE From 3062bd835f56b2021f9634f6262c49cafa94019b Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Wed, 29 Jan 2025 16:19:29 +0000 Subject: [PATCH 07/10] Implement NVHPCOpenACC entirely using python macros --- cmake/field_api_expand_fypp.cmake | 4 +- .../offload_backends/nvhpc/openacc.py | 161 ++++++++++++++++++ python_utils/offload_macros.py | 155 +++++++++++++++++ src/core/dev_alloc_module.fypp | 8 +- src/core/field_RANKSUFF_module.fypp | 8 +- .../field_RANKSUFF_shuffle_module.fypp | 8 +- src/util/field_RANKSUFF_access_module.fypp | 2 +- .../field_RANKSUFF_array_util_module.fypp | 10 +- src/util/field_RANKSUFF_util_module.fypp | 4 +- 9 files changed, 339 insertions(+), 21 deletions(-) diff --git a/cmake/field_api_expand_fypp.cmake b/cmake/field_api_expand_fypp.cmake index 
24e820b..2232382 100644 --- a/cmake/field_api_expand_fypp.cmake +++ b/cmake/field_api_expand_fypp.cmake @@ -41,7 +41,9 @@ macro( field_api_expand_fypp ) foreach (SRC ${_PAR_INPUT_SRCS} ) add_custom_command (OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${SRC}.F90 - COMMAND ${FYPP} -m os ${fypp_defines} -M ${_PAR_PYTHON_MODULE_DIR} -m fieldType ${_PAR_SOURCE_DIR}/${SRC}.fypp > ${CMAKE_CURRENT_BINARY_DIR}/${SRC}.F90 + COMMAND ${FYPP} -m os ${fypp_defines} -M ${_PAR_PYTHON_MODULE_DIR} -m fieldType + -DOFFLOAD_MODEL="${FIELD_API_OFFLOAD_MODEL}" -M ${_PAR_PYTHON_MODULE_DIR} -m offload_macros + ${_PAR_SOURCE_DIR}/${SRC}.fypp > ${CMAKE_CURRENT_BINARY_DIR}/${SRC}.F90 DEPENDS ${_PAR_SOURCE_DIR}/${SRC}.fypp VERBATIM) diff --git a/python_utils/offload_backends/nvhpc/openacc.py b/python_utils/offload_backends/nvhpc/openacc.py index b129ed7..a4b5c74 100644 --- a/python_utils/offload_backends/nvhpc/openacc.py +++ b/python_utils/offload_backends/nvhpc/openacc.py @@ -17,6 +17,9 @@ class NVHPCOpenACC(): """ pragma = '!$acc' + _data_attributes = ['private', 'copy', 'copyout', 'copyin', 'present', 'deviceptr', 'create'] + _loop_attributes = ['gang', 'vector', 'worker', 'seq'] + _declare_attributes = ['create', 'device_resident', 'deviceptr'] @classmethod def runtime_api_import(cls): @@ -89,3 +92,161 @@ def copy_from_device_1D_async(cls, dev, host, size, queue): """ return f"CALL ACC_MEMCPY_FROM_DEVICE_ASYNC ({host}, {dev}, {size}, {queue})" + + @classmethod + def host_mapped_dev_alloc(cls, data): + """ + Allocate host-mapped memory on device. + """ + + return f"!$acc enter data create ({','.join(data)})" + + @classmethod + def host_mapped_dev_free(cls, data): + """ + Free host-mapped memory on device. + """ + + return f"!$acc exit data delete ({','.join(data)})" + + @classmethod + def attach_dev_ptr(cls, ptr): + """ + Attach device pointer to its target on device. 
+ """ + + return f"!$acc enter data attach ({ptr})" + + @classmethod + def detach_dev_ptr(cls, ptr): + """ + Detach device pointer from its target on device. + """ + + return f"!$acc exit data detach ({ptr})" + + @classmethod + def launch_kernel(cls, **kwargs): + """ + Launch an implicitly mapped parallel kernel on device. + """ + + _data_spec = "" + for attr in cls._data_attributes: + decl = kwargs.get(attr, None) + if decl: + _data_spec += f"{attr}({','.join(decl)}) " + + return f"!$acc kernels {_data_spec}" + + @classmethod + def end_kernel(cls): + """ + End an implicitly mapped parallel kernel on device. + """ + + return "!$acc end kernels" + + @classmethod + def async_wait(cls, stream): + """ + Wait for the operations queued on a stream to complete. + """ + + return f"!$acc wait ({stream})" + + @classmethod + def launch_parallel_loop(cls, **kwargs): + """ + Launch an explicitly mapped parallel kernel on device. + """ + + _loop_spec = "" + for attr in cls._loop_attributes: + if kwargs.get(attr, None): + _loop_spec += f"{attr} " + + for attr in cls._data_attributes: + decl = kwargs.get(attr, None) + if decl: + _loop_spec += f"{attr}({','.join(decl)}) " + + return f"!$acc parallel loop {_loop_spec}" + + @classmethod + def end_parallel_loop(cls): + """ + End an explicitly mapped parallel kernel on device. + """ + + return "!$acc end parallel loop" + + @classmethod + def annotate_parallel_loop(cls, **kwargs): + """ + Annotate a loop in a device parallel region. + """ + + _loop_spec = "" + for attr in cls._loop_attributes: + if kwargs.get(attr, None): + _loop_spec += f"{attr} " + + for attr in cls._data_attributes: + decl = kwargs.get(attr, None) + if decl: + _loop_spec += f"{attr}({','.join(decl)}) " + + return f"!$acc loop {_loop_spec}" + + @classmethod + def declare(cls, **kwargs): + """ + Issue a device declaration for a host-mapped symbol. 
+ """ + + _decl_spec = "" + for attr in cls._declare_attributes: + decl = kwargs.get(attr, None) + if decl: + _decl_spec += f"{attr}({','.join(decl)}) " + + return f"!$acc declare {_decl_spec}" + + @classmethod + def launch_serial_kernel(cls, **kwargs): + """ + Launch a serial kernel on device. + """ + + _data_spec = "" + for attr in cls._data_attributes: + decl = kwargs.get(attr, None) + if decl: + _data_spec += f"{attr}({','.join(decl)}) " + + return f"!$acc serial {_data_spec}" + + @classmethod + def end_serial_kernel(cls): + """ + End a serial device kernel. + """ + + return "!$acc end serial" + + @classmethod + def update_device(cls, data): + """ + Update host-mapped symbol on device. + """ + + return f"!$acc update device ({','.join(data)})" + + @classmethod + def update_host(cls, data): + """ + Update device-mapped symbol on host. + """ + + return f"!$acc update self ({','.join(data)})" diff --git a/python_utils/offload_macros.py b/python_utils/offload_macros.py index 9fc310d..0fc6506 100644 --- a/python_utils/offload_macros.py +++ b/python_utils/offload_macros.py @@ -196,3 +196,158 @@ def CopyFromDevice1DAsync(dev, host, size, queue, indent=0): method = _get_method(backend, 'copy_from_device_1D_async') return _format_lines(method(dev, host, size, queue), indent=indent) + +def HostMappedDevAlloc(data, indent=0): + """ + Allocate host-mapped memory on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'host_mapped_dev_alloc') + + return _format_lines(method(data), indent=indent) + +def HostMappedDevFree(data, indent=0): + """ + Free host-mapped memory on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'host_mapped_dev_free') + + return _format_lines(method(data), indent=indent) + +def AttachDevPtr(ptr, indent=0): + """ + Attach a device pointer to its target. 
+ """ + + backend = _get_offload_backend() + method = _get_method(backend, 'attach_dev_ptr') + + return _format_lines(method(ptr), indent=indent) + +def DetachDevPtr(ptr, indent=0): + """ + Detach a device pointer from its target. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'detach_dev_ptr') + + return _format_lines(method(ptr), indent=indent) + +def LaunchParallelKernel(**kwargs): + """ + Launch an implicitly mapped parallel kernel on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'launch_kernel') + + indent = kwargs.pop('indent', 0) + return _format_lines(method(**kwargs), indent=indent) + +def EndParallelKernel(indent=0): + """ + End an implicitly mapped parallel kernel on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'end_kernel') + + return _format_lines(method(), indent=indent) + +def WaitAsyncStream(stream, indent=0): + """ + Wait for the operations queued on a stream to complete. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'async_wait') + + return _format_lines(method(stream), indent=indent) + +def LaunchParallelLoop(**kwargs): + """ + Launch an explicitly mapped parallel kernel on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'launch_parallel_loop') + + indent = kwargs.pop('indent', 0) + return _format_lines(method(**kwargs), indent=indent) + +def EndParallelLoop(indent=0): + """ + End an explicitly mapped parallel kernel on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'end_parallel_loop') + + return _format_lines(method(), indent=indent) + +def AnnotateParallelLoop(**kwargs): + """ + Annotate a loop in a device parallel region. 
+ """ + + backend = _get_offload_backend() + method = _get_method(backend, 'annotate_parallel_loop') + + indent = kwargs.pop('indent', 0) + return _format_lines(method(**kwargs), indent=indent) + +def Declare(**kwargs): + """ + Issue a device declaration for a host-mapped symbol. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'declare') + + indent = kwargs.pop('indent', 0) + return _format_lines(method(**kwargs), indent=indent) + +def LaunchSerialKernel(**kwargs): + """ + Launch a serial kernel on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'launch_serial_kernel') + + indent = kwargs.pop('indent', 0) + return _format_lines(method(**kwargs), indent=indent) + +def EndSerialKernel(indent=0): + """ + End a serial device kernel. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'end_serial_kernel') + + return _format_lines(method(), indent=indent) + +def UpdateDevice(data, indent=0): + """ + Update host-mapped symbol on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'update_device') + + return _format_lines(method(data), indent=indent) + +def UpdateHost(data, indent=0): + """ + Update device-mapped symbol on host. 
+ """ + + backend = _get_offload_backend() + method = _get_method(backend, 'update_host') + + return _format_lines(method(data), indent=indent) diff --git a/src/core/dev_alloc_module.fypp b/src/core/dev_alloc_module.fypp index c86bda0..f7db5a8 100644 --- a/src/core/dev_alloc_module.fypp +++ b/src/core/dev_alloc_module.fypp @@ -125,7 +125,7 @@ CALL C_F_POINTER (PTR, TMP, UBOUNDS-ILBOUNDS+1) DEV (${ ', '.join (map (lambda i: 'ILBOUNDS (' + str (i) + '):', range (1, ft.rank+1))) }$) => TMP IF(MAP_DEVPTR)THEN -!$acc enter data create (DEV) +$:offload_macros.HostMappedDevAlloc(data=['DEV',]) ENDIF IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_DEVICE_ALLOCATE (SIZE (DEV, KIND=JPIB) * INT (KIND (DEV), KIND=JPIB)) @@ -151,7 +151,7 @@ IF (ASSOCIATED (DEV)) THEN IF(MAP_DEVPTR)THEN CALL DEV_FREE (PTR) - !$acc exit data delete (DEV) +$:offload_macros.HostMappedDevFree(data=['DEV',], indent=4) ELSE #:if defined('CUDA') ISTAT = CUDA_FREE (PTR) @@ -181,7 +181,7 @@ ALLOCATE (DEV (${ ', '.join (map (lambda i: 'LBOUND (HST, ' + str (i) + '):UBOUN ALLOCATE (DEV, MOLD=HST) #endif -!$acc enter data create (DEV) +$:offload_macros.HostMappedDevAlloc(data=['DEV',]) IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_DEVICE_ALLOCATE (SIZE (DEV, KIND=JPIB) * INT (KIND (DEV), KIND=JPIB)) @@ -198,7 +198,7 @@ IF (ASSOCIATED (DEV)) THEN IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_DEVICE_DEALLOCATE (SIZE (DEV, KIND=JPIB) * INT (KIND (DEV), KIND=JPIB)) - !$acc exit data delete (DEV) +$:offload_macros.HostMappedDevFree(data=['DEV',], indent=2) DEALLOCATE (DEV) NULLIFY (DEV) ENDIF diff --git a/src/core/field_RANKSUFF_module.fypp b/src/core/field_RANKSUFF_module.fypp index 3644404..c3308ba 100644 --- a/src/core/field_RANKSUFF_module.fypp +++ b/src/core/field_RANKSUFF_module.fypp @@ -392,7 +392,7 @@ CONTAINS SELF%LOBJECT_COPIED = .TRUE. 
#ifdef _OPENACC IF (ASSOCIATED (SELF%DEVPTR)) THEN - !$acc enter data attach (SELF%DEVPTR) +$:offload_macros.AttachDevPtr(ptr='SELF%DEVPTR', indent=6) ENDIF #endif ENDIF @@ -416,7 +416,7 @@ CONTAINS SELF%LOBJECT_COPIED = .FALSE. #ifdef _OPENACC IF (ASSOCIATED (SELF%DEVPTR)) THEN - !$acc exit data detach (SELF%DEVPTR) +$:offload_macros.DetachDevPtr(ptr='SELF%DEVPTR', indent=6) ENDIF #endif ENDIF @@ -561,9 +561,9 @@ CONTAINS ELSEIF (IAND (SELF%GET_STATUS (), NDEVFRESH) /= 0) THEN CALL SELF%GET_DEVICE_DATA_RDONLY (PTR) ALLOCATE (ZZ, MOLD=PTR) -!$acc kernels present (PTR) copyout (ZZ) +$:offload_macros.LaunchParallelKernel(present=['PTR',], copyout=['ZZ',]) ZZ = PTR -!$acc end kernels +$:offload_macros.EndParallelKernel() ILEN = SIZE (ZZ) * KIND (ZZ) CALL CRC64 (ZZ, ILEN, ICRC) ENDIF diff --git a/src/shuffle/field_RANKSUFF_shuffle_module.fypp b/src/shuffle/field_RANKSUFF_shuffle_module.fypp index c76c0de..8a7d612 100644 --- a/src/shuffle/field_RANKSUFF_shuffle_module.fypp +++ b/src/shuffle/field_RANKSUFF_shuffle_module.fypp @@ -158,13 +158,13 @@ INTEGER (KIND=JPIM) :: ${ind}$ #:endif #:if what == 'DEVICE' -!$acc parallel loop gang present (PTRG, PTRS, KNDS) +$:offload_macros.LaunchParallelLoop(gang=True, present=['PTRG', 'PTRS', 'KNDS']) #:elif what == 'HOST' !$OMP PARALLEL DO PRIVATE (${ind}$JBLKG, JLONG, JBLKS, JLONS) #:endif DO JBLKG = 1, SIZE (KNDS, 3) #:if what == 'DEVICE' -!$acc loop vector private (${ind}$JLONG, JBLKS, JLONS) +$:offload_macros.AnnotateParallelLoop(vector=True, private=[f'{ind}JLONG', 'JBLKS', 'JLONS']) #:endif DO JLONG = 1, SIZE (KNDS, 2) JLONS = KNDS (NLONDIM, JLONG, JBLKG) @@ -253,13 +253,13 @@ INTEGER (KIND=JPIM) :: ${ind}$ #:endif #:if what == 'DEVICE' -!$acc parallel loop gang present (PTRG, PTRS, KNDS) +$:offload_macros.LaunchParallelLoop(gang=True, present=['PTRG', 'PTRS', 'KNDS']) #:elif what == 'HOST' !$OMP PARALLEL DO PRIVATE (${ind}$JBLKG, JLONG, JBLKS, JLONS) #:endif DO JBLKG = 1, SIZE (KNDS, 3) #:if what == 'DEVICE' -!$acc loop 
vector private (${ind}$JLONG, JBLKS, JLONS) +$:offload_macros.AnnotateParallelLoop(vector=True, private=[f'{ind}JLONG', 'JBLKS', 'JLONS']) #:endif DO JLONG = 1, SIZE (KNDS, 2) JLONS = KNDS (NLONDIM, JLONG, JBLKG) diff --git a/src/util/field_RANKSUFF_access_module.fypp b/src/util/field_RANKSUFF_access_module.fypp index 4d0b30f..ca2b78d 100644 --- a/src/util/field_RANKSUFF_access_module.fypp +++ b/src/util/field_RANKSUFF_access_module.fypp @@ -44,7 +44,7 @@ PUBLIC :: GET_${what}$_DATA_${mode}$ #:for ft in fieldTypeList ${ft.type}$, TARGET, SAVE :: DUMMY_${ft.name}$ (${ ', '.join ([dumsize] * (ft.rank-1) + ['1']) }$) -!$acc declare create (DUMMY_${ft.name}$) +$:offload_macros.Declare(create=[f'DUMMY_{ft.name}',]) #:endfor diff --git a/src/util/field_RANKSUFF_array_util_module.fypp b/src/util/field_RANKSUFF_array_util_module.fypp index 29ab1d0..79a9aa3 100644 --- a/src/util/field_RANKSUFF_array_util_module.fypp +++ b/src/util/field_RANKSUFF_array_util_module.fypp @@ -66,14 +66,14 @@ LLCREATED = .FALSE. IF (PRESENT (LDCREATED)) LLCREATED = LDCREATED IF (.NOT. LLCREATED) THEN - !$acc enter data create (SELF) - !$acc update device (SELF) +$:offload_macros.HostMappedDevAlloc(data=['SELF',], indent=2) +$:offload_macros.UpdateDevice(data=['SELF',], indent=2) ENDIF -!$acc serial present (SELF) +$:offload_macros.LaunchSerialKernel(present=['SELF',]) NULLIFY (SELF%P) NULLIFY (SELF%F_P) -!$acc end serial +$:offload_macros.EndSerialKernel() IF (LLFIELDAPI .AND. ASSOCIATED (SELF%F_P)) THEN CALL COPY (SELF%F_P, LDCREATED) @@ -100,7 +100,7 @@ IF (LLFIELDAPI .AND. ASSOCIATED (SELF%F_P)) THEN ENDIF IF (.NOT. 
LLDELETED) THEN - !$acc exit data delete (SELF) +$:offload_macros.HostMappedDevFree(data=['SELF',], indent=2) ENDIF END SUBROUTINE diff --git a/src/util/field_RANKSUFF_util_module.fypp b/src/util/field_RANKSUFF_util_module.fypp index 7216024..541620c 100644 --- a/src/util/field_RANKSUFF_util_module.fypp +++ b/src/util/field_RANKSUFF_util_module.fypp @@ -162,9 +162,9 @@ SUBROUTINE LEGACY_${ft.name}$_ASSIGN (PTR_RHS, PTR_LHS) ${ft.type}$ :: PTR_RHS (${ft.shape}$), PTR_LHS (${ft.shape}$) -!$acc kernels present (PTR_RHS, PTR_LHS) +$:offload_macros.LaunchParallelKernel(present=['PTR_RHS', 'PTR_LHS']) PTR_RHS = PTR_LHS -!$acc end kernels +$:offload_macros.EndParallelKernel() END SUBROUTINE From 04188f566a599ac40bfa75a769f80d28c4dac4a6 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 31 Jan 2025 22:00:26 +0000 Subject: [PATCH 08/10] Adapt CUDA backend to use python macros --- .../offload_backends/nvhpc/__init__.py | 1 + .../offload_backends/nvhpc/openacc.py | 23 +++ .../offload_backends/nvhpc/openacc_cuda.py | 181 +++++++++++++++++ python_utils/offload_macros.py | 184 +++++++++++++++++- src/core/dev_alloc_module.fypp | 19 +- src/core/field_RANKSUFF_data_module.fypp | 58 +++--- src/core/host_alloc_module.fypp | 61 +++--- 7 files changed, 439 insertions(+), 88 deletions(-) create mode 100644 python_utils/offload_backends/nvhpc/openacc_cuda.py diff --git a/python_utils/offload_backends/nvhpc/__init__.py b/python_utils/offload_backends/nvhpc/__init__.py index 943bffd..8dd0379 100644 --- a/python_utils/offload_backends/nvhpc/__init__.py +++ b/python_utils/offload_backends/nvhpc/__init__.py @@ -8,3 +8,4 @@ # nor does it submit to any jurisdiction. 
from offload_backends.nvhpc.openacc import * +from offload_backends.nvhpc.openacc_cuda import * diff --git a/python_utils/offload_backends/nvhpc/openacc.py b/python_utils/offload_backends/nvhpc/openacc.py index a4b5c74..8830075 100644 --- a/python_utils/offload_backends/nvhpc/openacc.py +++ b/python_utils/offload_backends/nvhpc/openacc.py @@ -250,3 +250,26 @@ def update_host(cls, data): """ return f"!$acc update self ({','.join(data)})" + + @classmethod + def data_start(cls, **kwargs): + """ + Pragma to mark the start of a `data` region. + """ + + _data_spec = "" + for attr in cls._data_attributes: + decl = kwargs.get(attr, None) + if decl: + _data_spec += f"{attr}({','.join(decl)}) " + + return f"!$acc data {_data_spec}" + + @classmethod + def data_end(cls): + """ + Pragma to mark the end of a `data` region. + """ + + + return "!$acc end data" diff --git a/python_utils/offload_backends/nvhpc/openacc_cuda.py b/python_utils/offload_backends/nvhpc/openacc_cuda.py new file mode 100644 index 0000000..be6b890 --- /dev/null +++ b/python_utils/offload_backends/nvhpc/openacc_cuda.py @@ -0,0 +1,181 @@ +# (C) Copyright 2022- ECMWF. +# (C) Copyright 2022- Meteo-France. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. + + +__all__ = ['NVHPCOpenACCCUDA'] + +from offload_backends.nvhpc import NVHPCOpenACC + +class NVHPCOpenACCCUDA(NVHPCOpenACC): + """ + A class that defines the macros needed for GPU offload using Nvidia's + OpenACC implementation and CUDA runtime API. + """ + + @classmethod + def runtime_api_import(cls): + """ + Runtime API import. 
+ """ + + _import = [super().runtime_api_import(),] + _import += ["USE CUDAFOR",] + + return _import + + @classmethod + def stream_handle_kind(cls): + """ + Return the INTEGER kind specifier for a stream handle. + """ + + return "CUDA_STREAM_KIND" + + @classmethod + def dev_malloc_intf(cls): + """ + The ISO_C interface for a device memory allocation. + """ + + intf = """ + INTEGER FUNCTION CUDA_MALLOC (PTR,SIZ) BIND (C, NAME='cudaMalloc') + IMPORT :: C_PTR, C_SIZE_T + INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: SIZ + TYPE (C_PTR), INTENT(OUT) :: PTR + END FUNCTION CUDA_MALLOC + """ + + return intf.split('\n') + + @classmethod + def dev_free_intf(cls): + """ + The ISO_C interface for freeing device memory. + """ + + intf = """ + INTEGER FUNCTION CUDA_FREE (PTR) BIND (C, NAME='cudaFree') + IMPORT :: C_PTR + TYPE (C_PTR), VALUE, INTENT(IN) :: PTR + END FUNCTION CUDA_FREE + """ + + return intf.split('\n') + + @classmethod + def runtime_error_return_type(cls, symbols): + """ + Declaration for the variable used to store the runtime API error status. + """ + + return f"INTEGER :: {','.join(symbols)}" + + @classmethod + def dev_malloc(cls, ptr, size, return_val="ISTAT"): + """ + Allocate memory on device. + """ + + return f"{return_val} = CUDA_MALLOC({ptr}, {size})" + + @classmethod + def dev_free(cls, ptr, return_val="ISTAT"): + """ + Free device memory. + """ + + return f"{return_val} = CUDA_FREE({ptr})" + + @classmethod + def register_host(cls, ptr, size, flags, return_val="ISTAT"): + """ + Page-lock host memory. + """ + + return f"{return_val} = CUDA_HOST_REGISTER({ptr}, {size}, {flags})" + + @classmethod + def register_host_set_flags(cls, flag_var, val): + """ + Set flags used to control page-locking of host memory. + """ + + return f"{flag_var} = {val} !... Corresponds to cudaHostRegisterMapped" + + @classmethod + def register_host_decl_flags(cls, flag_var): + """ + Declare variable used to store flags for controlling page-locking of host memory. 
+ """ + + return f"INTEGER(C_INT) :: {flag_var}" + + @classmethod + def unregister_host(cls, ptr, return_val="ISTAT"): + """ + Unpin (i.e. undo page-locking) host memory. + """ + + return f"{return_val} = CUDA_HOST_UNREGISTER({ptr})" + + @classmethod + def host_register_intf(cls): + """ + The ISO_C interface for page-locking host memory. + """ + + intf = """ + INTEGER FUNCTION CUDA_HOST_REGISTER (PTR, SIZ, FLAGS) BIND (C, NAME='cudaHostRegister') + IMPORT :: C_PTR, C_SIZE_T, C_INT + TYPE (C_PTR), VALUE, INTENT(IN) :: PTR + INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: SIZ + INTEGER (C_INT), VALUE, INTENT(IN) :: FLAGS + END FUNCTION CUDA_HOST_REGISTER + """ + + return intf.split('\n') + + @classmethod + def host_unregister_intf(cls): + """ + The ISO_C interface for un-pinning (i.e. undo page-locking) host memory. + """ + + intf = """ + INTEGER FUNCTION CUDA_HOST_UNREGISTER (PTR) BIND (C, NAME='cudaHostUnregister') + IMPORT :: C_PTR + TYPE (C_PTR), VALUE, INTENT(IN) :: PTR + END FUNCTION CUDA_HOST_UNREGISTER + """ + + return intf.split('\n') + + @classmethod + def set_async_stream(cls, id, stream): + """ + Set an asynchronous stream. + """ + + return f"CALL ACC_SET_CUDA_STREAM({id}, {stream})" + + @classmethod + def copy_2D(cls, src, src_pitch, dst, dst_pitch, width, height, return_val="ISTAT"): + """ + Copy a strided memory region from source (src) to destination (dst). + """ + + return f"{return_val} = CUDAMEMCPY2D({dst}, {dst_pitch}, {src}, {src_pitch}, {width}, {height})" + + @classmethod + def copy_2D_async(cls, src, src_pitch, dst, dst_pitch, width, height, stream, return_val="ISTAT"): + """ + Asynchronously copy a strided memory region from source (src) to destination (dst). 
+ """ + + return f"{return_val} = CUDAMEMCPY2DASYNC({dst}, {dst_pitch}, {src}, {src_pitch}, {width}, {height}, STREAM={stream})" diff --git a/python_utils/offload_macros.py b/python_utils/offload_macros.py index 0fc6506..31ddccf 100644 --- a/python_utils/offload_macros.py +++ b/python_utils/offload_macros.py @@ -8,7 +8,7 @@ # nor does it submit to any jurisdiction. import fypp -from offload_backends import NVHPCOpenACC, HostOnly +from offload_backends import NVHPCOpenACC, NVHPCOpenACCCUDA, HostOnly """ A common entry point for retrieving macros from the various GPU offload backends. @@ -16,7 +16,8 @@ _offload_map = { 'NVHPCOpenACC': NVHPCOpenACC, - 'HostOnly': HostOnly + 'HostOnly': HostOnly, + 'NVHPCOpenACCCUDA': NVHPCOpenACCCUDA } def _wrap_lines(input_str, ref_len, pragma='', indent=0): @@ -70,7 +71,7 @@ def _format_lines(input_str, indent=0, width=132, pragma=''): _wrapped_lines = [] if isinstance(input_str, (list, tuple)): for s in input_str: - _wrapped_lines += _format_lines(s, indent=indent, pragma=pragma) + _wrapped_lines += [_format_lines(s, indent=indent, pragma=pragma),] else: ref_len = width - indent - 2 - len(pragma) if len(input_str) > width - indent: @@ -115,7 +116,7 @@ def RuntimeApiImport(indent=0): backend = _get_offload_backend() method = _get_method(backend, 'runtime_api_import') - return _format_lines(method()) + return _format_lines(method(), indent=indent) def CDevptrDecl(symbols, indent=0): """ @@ -125,7 +126,7 @@ def CDevptrDecl(symbols, indent=0): backend = _get_offload_backend() method = _get_method(backend, 'c_devptr_declaration') - return _format_lines(method(symbols)) + return _format_lines(method(symbols), indent=indent) def HostDataStart(symbols, indent=0): """ @@ -351,3 +352,176 @@ def UpdateHost(data, indent=0): method = _get_method(backend, 'update_host') return _format_lines(method(data), indent=indent) + +def StreamHandleKind(): + """ + Return the INTEGER kind specifier for a stream handle. 
+ """ + + backend = _get_offload_backend() + method = _get_method(backend, 'stream_handle_kind') + + return _format_lines(method()) + +def DataStart(**kwargs): + """ + Start a `data` (or equivalent) region. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'data_start') + + indent = kwargs.pop('indent', 0) + return _format_lines(method(**kwargs), indent=indent, pragma=backend.pragma) + +def DataEnd(indent=0): + """ + End a `data` (or equivalent) region. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'data_end') + + return _format_lines(method(), indent=indent) + +def DevMallocIntf(indent=0): + """ + The ISO_C interface for a device memory allocation. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'dev_malloc_intf') + + return _format_lines(method(), indent=indent) + +def DevFreeIntf(indent=0): + """ + The ISO_C interface for freeing device memory. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'dev_free_intf') + + return _format_lines(method(), indent=indent) + +def RuntimeErrorType(symbols, indent=0): + """ + Declaration for the variable used to store the runtime API error status. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'runtime_error_return_type') + + return _format_lines(method(symbols), indent=indent) + +def DevMalloc(ptr, size, return_val="ISTAT", indent=0): + """ + Allocate memory on device. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'dev_malloc') + + return _format_lines(method(ptr, size, return_val=return_val), indent=indent) + +def DevFree(ptr, return_val="ISTAT", indent=0): + """ + Free device memory. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'dev_free') + + return _format_lines(method(ptr, return_val=return_val), indent=indent) + +def RegisterHostSetFlags(flag_var, val, indent=0): + """ + Set flags for page-locking host memory. 
+ """ + + backend = _get_offload_backend() + method = _get_method(backend, 'register_host_set_flags') + + return _format_lines(method(flag_var, val), indent=indent) + +def RegisterHostDeclFlags(flag_var, indent=0): + """ + Declare variable used to store flags for controlling page-locking of host memory. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'register_host_decl_flags') + + return _format_lines(method(flag_var), indent=indent) + +def RegisterHost(ptr, size, flags, return_val="ISTAT", indent=0): + """ + Page-lock host memory. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'register_host') + + return _format_lines(method(ptr, size, flags, return_val=return_val), indent=indent) + +def UnregisterHost(ptr, return_val="ISTAT", indent=0): + """ + Unpin (i.e. undo page-locking) host memory. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'unregister_host') + + return _format_lines(method(ptr, return_val=return_val), indent=indent) + +def HostRegisterIntf(indent=0): + """ + The ISO_C interface for page-locking host memory. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'host_register_intf') + + return _format_lines(method(), indent=indent) + +def HostUnregisterIntf(indent=0): + """ + The ISO_C interface for un-pinning (i.e. undo page-locking) host memory. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'host_unregister_intf') + + return _format_lines(method(), indent=indent) + +def SetAsyncStream(id, stream, indent=0): + """ + Set an asynchronous stream. + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'set_async_stream') + + return _format_lines(method(id, stream), indent=indent) + +def Copy2D(src, src_pitch, dst, dst_pitch, width, height, return_val="ISTAT", indent=0): + """ + Copy a strided memory region from source (src) to destination (dst). 
+ """ + + backend = _get_offload_backend() + method = _get_method(backend, 'copy_2D') + + return _format_lines(method(src, src_pitch, dst, dst_pitch, width, height, return_val=return_val), + indent=indent) + +def Copy2DAsync(src, src_pitch, dst, dst_pitch, width, height, stream, return_val="ISTAT", indent=0): + """ + Asynchronously copy a strided memory region from source (src) to destination (dst). + """ + + backend = _get_offload_backend() + method = _get_method(backend, 'copy_2D_async') + + return _format_lines(method(src, src_pitch, dst, dst_pitch, width, height, stream, return_val=return_val), + indent=indent) diff --git a/src/core/dev_alloc_module.fypp b/src/core/dev_alloc_module.fypp index f7db5a8..6ed08f7 100644 --- a/src/core/dev_alloc_module.fypp +++ b/src/core/dev_alloc_module.fypp @@ -53,15 +53,8 @@ END INTERFACE #:if defined('CUDA') INTERFACE - INTEGER FUNCTION CUDA_MALLOC (PTR,SIZ) BIND (C, NAME='cudaMalloc') - IMPORT :: C_PTR, C_SIZE_T - INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: SIZ - TYPE (C_PTR), INTENT(OUT) :: PTR - END FUNCTION CUDA_MALLOC - INTEGER FUNCTION CUDA_FREE (PTR) BIND (C, NAME='cudaFree') - IMPORT :: C_PTR - TYPE (C_PTR), VALUE, INTENT(IN) :: PTR - END FUNCTION CUDA_FREE +$:offload_macros.DevMallocIntf(indent=2) +$:offload_macros.DevFreeIntf(indent=2) END INTERFACE #:endif @@ -102,7 +95,7 @@ LOGICAL, INTENT(IN) :: MAP_DEVPTR TYPE (C_PTR) :: PTR INTEGER (C_SIZE_T) :: SIZ #:if defined('CUDA') -INTEGER :: ISTAT +$:offload_macros.RuntimeErrorType(symbols=['ISTAT',]) #:endif ILBOUNDS = 1 @@ -117,7 +110,7 @@ IF(MAP_DEVPTR)THEN CALL DEV_MALLOC (SIZ, PTR) ELSE #:if defined('CUDA') - ISTAT = CUDA_MALLOC(PTR, SIZ) +$:offload_macros.DevMalloc(ptr='PTR', size='SIZ', return_val='ISTAT') #:endif ENDIF @@ -140,7 +133,7 @@ ${ft.type}$, POINTER :: DEV(${ft.shape}$) LOGICAL, INTENT(IN) :: MAP_DEVPTR TYPE (C_PTR) :: PTR #:if defined('CUDA') -INTEGER :: ISTAT +$:offload_macros.RuntimeErrorType(symbols=['ISTAT',]) #:endif IF (ASSOCIATED (DEV)) THEN @@ 
-154,7 +147,7 @@ IF (ASSOCIATED (DEV)) THEN $:offload_macros.HostMappedDevFree(data=['DEV',], indent=4) ELSE #:if defined('CUDA') - ISTAT = CUDA_FREE (PTR) +$:offload_macros.DevFree(ptr='PTR', return_val='ISTAT') #:endif ENDIF diff --git a/src/core/field_RANKSUFF_data_module.fypp b/src/core/field_RANKSUFF_data_module.fypp index f6d95f9..7dc0fda 100644 --- a/src/core/field_RANKSUFF_data_module.fypp +++ b/src/core/field_RANKSUFF_data_module.fypp @@ -138,9 +138,9 @@ $:offload_macros.HostDataStart(symbols=[f'DEV ({ard})'], indent=ft.rank - e) ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(f'DEV ({ard})')}$ $:offload_macros.HostDataEnd(indent=ft.rank - e) ${indent}$ ELSE - ${indent}$ !$acc data deviceptr(DEVPTR, DEV) - ${indent}$ DEVPTR = C_DEVLOC(DEV (${ard}$)) - ${indent}$ !$acc end data +$:offload_macros.DataStart(deviceptr=['DEVPTR', 'DEV'], indent=ft.rank - e) + ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(f'DEV ({ard})')}$ +$:offload_macros.DataEnd(indent=ft.rank - e) ${indent}$ ENDIF #endif #:if d == 0 @@ -180,22 +180,22 @@ $:offload_macros.CopyFromDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE' #:for d1 in range (0, ft.rank) #:for d2 in range (d1+1, ft.rank+1) SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) - USE OPENACC - USE CUDAFOR +$:offload_macros.RuntimeApiImport(indent=4) USE FIELD_ABORT_MODULE ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) INTEGER (KIND=JPIM), INTENT (IN) :: KDIR LOGICAL, INTENT (IN) :: MAP_DEVPTR INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE - INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH, IRET + INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH INTEGER (KIND=JPIM) :: IWIDTH, IHEIGHT, ISHP(${ft.rank+1}$) #:if d2 < ft.rank INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d2, ft.rank))))}$ #:endif - INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + INTEGER(KIND=${offload_macros.StreamHandleKind()}$) :: STREAM TYPE(C_PTR) :: HSTPTR - TYPE(C_DEVPTR) :: 
DEVPTR +$:offload_macros.CDevptrDecl(symbols=['DEVPTR'], indent=4) +$:offload_macros.RuntimeErrorType(symbols=['IRET',], indent=4) ISHP(1) = 1 ISHP(2:) = SHAPE(HST) @@ -216,42 +216,36 @@ $:offload_macros.CopyFromDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE' #:set ar = lambda arr: ', '.join(lbnds(arr, 0, d2) + [f'J{i+1}' for i in range(d2, ft.rank)]) ${indent}$ HSTPTR = C_LOC(HST (${ar('HST')}$)) ${indent}$ IF (MAP_DEVPTR) THEN - ${indent}$ !$acc host_data use_device(DEV) - ${indent}$ DEVPTR = C_DEVLOC(DEV (${ar('DEV')}$)) - ${indent}$ !$acc end host_data +$:offload_macros.HostDataStart(symbols=['DEV',], indent=ft.rank - d2 - 1) + ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(symbol=f"DEV({ar('DEV')})")}$ +$:offload_macros.HostDataEnd(indent=ft.rank - d2 - 1) ${indent}$ ELSE - ${indent}$ !$acc data deviceptr(DEVPTR,DEV) - ${indent}$ DEVPTR = C_DEVLOC(DEV (${ar('DEV')}$)) - ${indent}$ !$acc end data +$:offload_macros.DataStart(deviceptr=['DEVPTR', 'DEV'], indent=ft.rank - d2 - 1) + ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(symbol=f"DEV({ar('DEV')})")}$ +$:offload_macros.DataEnd(indent=ft.rank - d2 - 1) ${indent}$ ENDIF ${indent}$ IF (KDIR == NH2D) THEN ${indent}$ IF(PRESENT(QUEUE)) THEN - ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) - ${indent}$ IRET = CUDAMEMCPY2DASYNC (DEVPTR, IDEV_PITCH, & - ${indent}$ & HSTPTR, IHST_PITCH, & - ${indent}$ & IWIDTH, IHEIGHT, & - ${indent}$ & STREAM=STREAM) +$:offload_macros.SetAsyncStream(id='QUEUE', stream='STREAM', indent=ft.rank - d2 + 9) +$:offload_macros.Copy2DAsync(dst='DEVPTR', dst_pitch='IDEV_PITCH', src='HSTPTR', src_pitch='IHST_PITCH', & +& width='IWIDTH', height='IHEIGHT', stream='STREAM', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ELSE - ${indent}$ IRET = CUDAMEMCPY2D (DEVPTR, IDEV_PITCH, & - ${indent}$ & HSTPTR, IHST_PITCH, & - ${indent}$ & IWIDTH, IHEIGHT) +$:offload_macros.Copy2D(dst='DEVPTR', dst_pitch='IDEV_PITCH', src='HSTPTR', src_pitch='IHST_PITCH', & +& width='IWIDTH', 
height='IHEIGHT', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ENDIF - ${indent}$ IF (IRET /= CUDASUCCESS) THEN + ${indent}$ IF (IRET /= 0) THEN ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: HOST-TO-DEVICE TRANSFER FAILED") ${indent}$ ENDIF ${indent}$ ELSEIF (KDIR == ND2H) THEN ${indent}$ IF(PRESENT(QUEUE)) THEN - ${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM) - ${indent}$ IRET = CUDAMEMCPY2DASYNC (HSTPTR, IHST_PITCH, & - ${indent}$ & DEVPTR, IDEV_PITCH, & - ${indent}$ & IWIDTH, IHEIGHT, & - ${indent}$ & STREAM=STREAM) +$:offload_macros.SetAsyncStream(id='QUEUE', stream='STREAM', indent=ft.rank - d2 + 9) +$:offload_macros.Copy2DAsync(dst='HSTPTR', dst_pitch='IHST_PITCH', src='DEVPTR', src_pitch='IDEV_PITCH', & +& width='IWIDTH', height='IHEIGHT', stream='STREAM', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ELSE - ${indent}$ IRET = CUDAMEMCPY2D (HSTPTR, IHST_PITCH, & - ${indent}$ & DEVPTR, IDEV_PITCH, & - ${indent}$ & IWIDTH, IHEIGHT) +$:offload_macros.Copy2D(dst='HSTPTR', dst_pitch='IHST_PITCH', src='DEVPTR', src_pitch='IDEV_PITCH', & +& width='IWIDTH', height='IHEIGHT', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ENDIF - ${indent}$ IF (IRET /= CUDASUCCESS) THEN + ${indent}$ IF (IRET /= 0) THEN ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: DEVICE-TO-HOST TRANSFER FAILED") ${indent}$ ENDIF ${indent}$ ENDIF diff --git a/src/core/host_alloc_module.fypp b/src/core/host_alloc_module.fypp index 13fee5d..2ed2cc8 100644 --- a/src/core/host_alloc_module.fypp +++ b/src/core/host_alloc_module.fypp @@ -66,16 +66,8 @@ END INTERFACE #:if defined('CUDA') INTERFACE - INTEGER FUNCTION CUDA_HOST_REGISTER (PTR, SIZ, FLAGS) BIND (C, NAME='cudaHostRegister') - IMPORT :: C_PTR, C_SIZE_T, C_INT - TYPE (C_PTR), VALUE, INTENT(IN) :: PTR - INTEGER (C_SIZE_T), VALUE, INTENT(IN) :: SIZ - INTEGER (C_INT), VALUE, INTENT(IN) :: FLAGS - END FUNCTION CUDA_HOST_REGISTER - INTEGER FUNCTION 
CUDA_HOST_UNREGISTER (PTR) BIND (C, NAME='cudaHostUnregister') - IMPORT :: C_PTR - TYPE (C_PTR), VALUE, INTENT(IN) :: PTR - END FUNCTION CUDA_HOST_UNREGISTER +$:offload_macros.HostRegisterIntf(indent=3) +$:offload_macros.HostUnregisterIntf(indent=3) END INTERFACE #:endif @@ -122,16 +114,12 @@ CONTAINS SUBROUTINE MEM_BLOCK_INIT( SELF ) CLASS(MEM_BLOCK) :: SELF - INTEGER :: ISTAT CALL C_MALLOC(SELF%SIZE, SELF%DATA) #:if defined('CUDA') IF(INIT_PINNED_VALUE)THEN - CALL PIN_ALLOCATION(SELF%DATA, SELF%SIZE, ISTAT) - IF (ISTAT /= 0) THEN - CALL FIELD_ABORT ("MEM_POOL: FAILED TO REGISTER IN PAGE-LOCKED MEMORY") - ENDIF + CALL PIN_ALLOCATION(SELF%DATA, SELF%SIZE) ENDIF #:endif @@ -159,7 +147,6 @@ END SUBROUTINE MEM_BLOCK_DEALLOC SUBROUTINE MEM_BLOCK_FINAL( SELF ) CLASS(MEM_BLOCK) :: SELF - INTEGER :: ISTAT IF( .NOT. SELF%NUMFLDS == 0 )THEN PRINT *, "FIELD_API DETECTED UNFINALISED FIELDS, POTENTIAL DEVICE MEMORY LEAK" @@ -167,10 +154,7 @@ SUBROUTINE MEM_BLOCK_FINAL( SELF ) #:if defined('CUDA') IF (INIT_PINNED_VALUE) THEN - CALL UNPIN_ALLOCATION(SELF%DATA, ISTAT) - IF (ISTAT /= 0) THEN - CALL FIELD_ABORT ("MEM_POOL: FAILED TO UNREGISTER PAGE-LOCKED MEMORY") - ENDIF + CALL UNPIN_ALLOCATION(SELF%DATA) ENDIF #:endif @@ -265,25 +249,33 @@ SUBROUTINE MEM_POOL_REQUEST_MEM( SELF, ALLOC_SIZE, BLK, DATA, BLKID ) END SUBROUTINE MEM_POOL_REQUEST_MEM #:if defined('CUDA') -SUBROUTINE PIN_ALLOCATION(DATA, ARR_SIZE, ISTAT) +SUBROUTINE PIN_ALLOCATION(DATA, ARR_SIZE) TYPE(C_PTR), INTENT(INOUT) :: DATA - INTEGER, INTENT(OUT) :: ISTAT INTEGER(C_SIZE_T), INTENT(IN) :: ARR_SIZE +$:offload_macros.RuntimeErrorType(symbols=['ISTAT',], indent=3) - INTEGER(C_INT) :: FLAGS +$:offload_macros.RegisterHostDeclFlags(flag_var='FLAGS', indent=3) - FLAGS = 2 !... 
Corresponds to cudaHostRegisterMapped - ISTAT = CUDA_HOST_REGISTER (DATA, ARR_SIZE, FLAGS) +$:offload_macros.RegisterHostSetFlags(flag_var='FLAGS', val='2', indent=3) +$:offload_macros.RegisterHost(ptr='DATA', size='ARR_SIZE', flags='FLAGS', return_val='ISTAT') + + IF (ISTAT /= 0) THEN + CALL FIELD_ABORT ("FAILED TO REGISTER IN PAGE-LOCKED MEMORY") + ENDIF END SUBROUTINE PIN_ALLOCATION -SUBROUTINE UNPIN_ALLOCATION(DATA, ISTAT) +SUBROUTINE UNPIN_ALLOCATION(DATA) TYPE(C_PTR), INTENT(INOUT) :: DATA - INTEGER, INTENT(OUT) :: ISTAT +$:offload_macros.RuntimeErrorType(symbols=['ISTAT',], indent=3) - ISTAT = CUDA_HOST_UNREGISTER (DATA) +$:offload_macros.UnregisterHost(ptr='DATA', return_val='ISTAT', indent=3) + + IF (ISTAT /= 0) THEN + CALL FIELD_ABORT ("FAILED TO UNREGISTER PAGE-LOCKED MEMORY") + ENDIF END SUBROUTINE UNPIN_ALLOCATION #:endif @@ -330,7 +322,7 @@ SUBROUTINE ${ft.name}$_HOST_ALLOC (HST, LBOUNDS, UBOUNDS, PINNED) ${ft.type}$, POINTER :: PTR(${ft.shape}$) TYPE(C_PTR) :: DATA INTEGER(C_SIZE_T) :: ARR_SIZE - INTEGER :: ISHAPE(${ft.rank}$), ISTAT + INTEGER :: ISHAPE(${ft.rank}$) ARR_SIZE = KIND(HST) #:for r in range(ft.rank) @@ -343,10 +335,7 @@ SUBROUTINE ${ft.name}$_HOST_ALLOC (HST, LBOUNDS, UBOUNDS, PINNED) #:if defined('CUDA') IF(PINNED)THEN - CALL PIN_ALLOCATION(DATA, ARR_SIZE, ISTAT) - IF (ISTAT /= 0) THEN - CALL FIELD_ABORT ("${ft.name}$_OWNER: FAILED TO REGISTER IN PAGE-LOCKED MEMORY") - ENDIF + CALL PIN_ALLOCATION(DATA, ARR_SIZE) ENDIF #:endif @@ -366,7 +355,6 @@ SUBROUTINE ${ft.name}$_HOST_DEALLOC(HST, PINNED) LOGICAL, INTENT (IN) :: PINNED TYPE(C_PTR) :: DATA - INTEGER :: ISTAT IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_HOST_DEALLOCATE (SIZE (HST, KIND=JPIB) * INT (KIND (HST), KIND=JPIB)) @@ -375,10 +363,7 @@ SUBROUTINE ${ft.name}$_HOST_DEALLOC(HST, PINNED) #:if defined('CUDA') IF (PINNED) THEN - CALL UNPIN_ALLOCATION(DATA, ISTAT) - IF (ISTAT /= 0) THEN - CALL FIELD_ABORT ("${ft.name}$_OWNER: FAILED TO UNREGISTER PAGE-LOCKED MEMORY") - ENDIF + 
CALL UNPIN_ALLOCATION(DATA) ENDIF #:endif From aac718d296b4ab405465b0aeee494a189c7f1875 Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Fri, 31 Jan 2025 22:23:57 +0000 Subject: [PATCH 09/10] Generalise offload related preproc definitions --- CMakeLists.txt | 2 +- cmake/field_api_add_object_library.cmake | 1 + cmake/field_api_get_offload_model.cmake | 5 +++++ src/core/dev_alloc_module.fypp | 12 ++++++------ src/core/field_RANKSUFF_data_module.fypp | 14 +++++--------- src/core/field_RANKSUFF_module.fypp | 13 +++++-------- src/core/host_alloc_module.fypp | 12 ++++++------ 7 files changed, 29 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f15de9d..7a0455d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,7 +77,7 @@ if(HAVE_BUDDY_MALLOC) list( APPEND fypp_defines "-DUSE_BUDDY_MALLOC") endif() if(HAVE_CUDA) - list( APPEND fypp_defines "-DCUDA") + list( APPEND fypp_defines "-DWITH_HIC") endif() if(fiat_FOUND) list( APPEND fypp_defines "-DWITH_FIAT") diff --git a/cmake/field_api_add_object_library.cmake b/cmake/field_api_add_object_library.cmake index c78c500..9de21b4 100644 --- a/cmake/field_api_add_object_library.cmake +++ b/cmake/field_api_add_object_library.cmake @@ -54,6 +54,7 @@ macro(field_api_add_object_library) ${_PAR_DEFINITIONS} $<$:${FIELD_API_DEFINITIONS}> $<${fiat_FOUND}:WITH_FIAT> + ${FIELD_API_OFFLOAD_DEFINITIONS} PRIVATE_LIBS ${_PAR_LIBRARIES} $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> diff --git a/cmake/field_api_get_offload_model.cmake b/cmake/field_api_get_offload_model.cmake index afaca7f..891a814 100644 --- a/cmake/field_api_get_offload_model.cmake +++ b/cmake/field_api_get_offload_model.cmake @@ -54,4 +54,9 @@ macro( field_api_get_offload_model ) set(FIELD_API_OFFLOAD_MODEL "NVHPCOpenACC") endif() + unset(FIELD_API_OFFLOAD_DEFINITIONS) + if( HAVE_ACC ) + list(APPEND FIELD_API_OFFLOAD_DEFINITIONS WITH_GPU_OFFLOAD) + endif() + endmacro() diff --git a/src/core/dev_alloc_module.fypp b/src/core/dev_alloc_module.fypp 
index 6ed08f7..2331f8a 100644 --- a/src/core/dev_alloc_module.fypp +++ b/src/core/dev_alloc_module.fypp @@ -51,7 +51,7 @@ INTERFACE END INTERFACE #:endif -#:if defined('CUDA') +#:if defined('WITH_HIC') INTERFACE $:offload_macros.DevMallocIntf(indent=2) $:offload_macros.DevFreeIntf(indent=2) @@ -63,7 +63,7 @@ CONTAINS #:for ft in fieldTypeList -#:if defined('USE_BUDDY_MALLOC') or defined('CUDA') +#:if defined('USE_BUDDY_MALLOC') or defined('WITH_HIC') SUBROUTINE ${ft.name}$_DEV_ALLOCATE_HST (DEV, HST, MAP_DEVPTR) @@ -94,7 +94,7 @@ LOGICAL, INTENT(IN) :: MAP_DEVPTR TYPE (C_PTR) :: PTR INTEGER (C_SIZE_T) :: SIZ -#:if defined('CUDA') +#:if defined('WITH_HIC') $:offload_macros.RuntimeErrorType(symbols=['ISTAT',]) #:endif @@ -109,7 +109,7 @@ SIZ = SIZ * INT (UBOUNDS(${i}$)-ILBOUNDS(${i}$)+1, C_SIZE_T) IF(MAP_DEVPTR)THEN CALL DEV_MALLOC (SIZ, PTR) ELSE -#:if defined('CUDA') +#:if defined('WITH_HIC') $:offload_macros.DevMalloc(ptr='PTR', size='SIZ', return_val='ISTAT') #:endif ENDIF @@ -132,7 +132,7 @@ USE FIELD_STATISTICS_MODULE ${ft.type}$, POINTER :: DEV(${ft.shape}$) LOGICAL, INTENT(IN) :: MAP_DEVPTR TYPE (C_PTR) :: PTR -#:if defined('CUDA') +#:if defined('WITH_HIC') $:offload_macros.RuntimeErrorType(symbols=['ISTAT',]) #:endif @@ -146,7 +146,7 @@ IF (ASSOCIATED (DEV)) THEN CALL DEV_FREE (PTR) $:offload_macros.HostMappedDevFree(data=['DEV',], indent=4) ELSE -#:if defined('CUDA') +#:if defined('WITH_HIC') $:offload_macros.DevFree(ptr='PTR', return_val='ISTAT') #:endif ENDIF diff --git a/src/core/field_RANKSUFF_data_module.fypp b/src/core/field_RANKSUFF_data_module.fypp index 7dc0fda..9616664 100644 --- a/src/core/field_RANKSUFF_data_module.fypp +++ b/src/core/field_RANKSUFF_data_module.fypp @@ -66,7 +66,7 @@ CONTAINS ENDIF SELECT CASE (LAST_CONTIG_DIM) -#:if defined('CUDA') +#:if defined('WITH_HIC') CASE (${ft.rank}$) FUNC => ${ftn}$_COPY_DIM${ft.rank}$_CONTIGUOUS #:for d1 in range (ft.rank) @@ -111,9 +111,7 @@ CONTAINS #:for d in range (0, ft.rank+1) SUBROUTINE 
${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) -#ifdef _OPENACC $:offload_macros.RuntimeApiImport(indent=2) -#endif USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY : INT64 ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) LOGICAL, INTENT (IN) :: MAP_DEVPTR @@ -121,9 +119,7 @@ $:offload_macros.RuntimeApiImport(indent=2) INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE INTEGER (KIND=INT64) :: ISIZE INTEGER :: ${', '.join (['J'] + list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ -#ifdef _OPENACC $:offload_macros.CDevptrDecl(symbols=['DEVPTR'], indent=4) -#endif #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) @@ -132,7 +128,7 @@ $:offload_macros.CDevptrDecl(symbols=['DEVPTR'], indent=4) #:set lbdiff = lambda i: f'LBOUND(DEV,{i}) - LBOUND (HST,{i})' #:set ard = ', '.join ([':'] * d + ['J' + str(i+1) + ' + ' + lbdiff(i+1) for i in range (d, ft.rank)]) #:set indent = ' ' * (ft.rank - e) -#ifdef _OPENACC +#ifdef WITH_GPU_OFFLOAD ${indent}$ IF(MAP_DEVPTR)THEN $:offload_macros.HostDataStart(symbols=[f'DEV ({ard})'], indent=ft.rank - e) ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(f'DEV ({ard})')}$ @@ -149,7 +145,7 @@ $:offload_macros.DataEnd(indent=ft.rank - e) ${indent}$ ISIZE = SIZEOF ( HST(${ar}$) ) #:endif ${indent}$ IF (KDIR == NH2D) THEN -#ifdef _OPENACC +#ifdef WITH_GPU_OFFLOAD ${indent}$ IF(PRESENT(QUEUE))THEN $:offload_macros.CopyToDevice1DAsync(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) ${indent}$ ELSE @@ -159,7 +155,7 @@ $:offload_macros.CopyToDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', ${indent}$ DEV (${ard}$) = HST (${ar}$) #endif ${indent}$ ELSEIF (KDIR == ND2H) THEN -#ifdef _OPENACC +#ifdef WITH_GPU_OFFLOAD ${indent}$ IF(PRESENT(QUEUE))THEN $:offload_macros.CopyFromDevice1DAsync(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) ${indent}$ ELSE @@ -176,7 +172,7 
@@ $:offload_macros.CopyFromDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE' #:endfor -#:if defined('CUDA') +#:if defined('WITH_HIC') #:for d1 in range (0, ft.rank) #:for d2 in range (d1+1, ft.rank+1) SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) diff --git a/src/core/field_RANKSUFF_module.fypp b/src/core/field_RANKSUFF_module.fypp index c3308ba..884e5f0 100644 --- a/src/core/field_RANKSUFF_module.fypp +++ b/src/core/field_RANKSUFF_module.fypp @@ -19,9 +19,6 @@ USE HOST_ALLOC_MODULE USE FIELD_BASIC_MODULE USE FIELD_CONSTANTS_MODULE USE FIELD_DEFAULTS_MODULE -#ifdef _OPENACC -USE OPENACC -#endif ${fieldType.useParkind1 ()}$ #:for ft in fieldTypeList @@ -187,7 +184,7 @@ CONTAINS IF(PRESENT(MAP_DEVPTR))THEN SELF%MAP_DEVPTR = MAP_DEVPTR ENDIF -#:if not defined('CUDA') +#:if not defined('WITH_HIC') IF(.NOT. SELF%MAP_DEVPTR)THEN CALL FIELD_ABORT ("${ftn}$_WRAPPER_INIT: CUDA backend needed to disable host-mapped device-pointer") ENDIF @@ -235,7 +232,7 @@ CONTAINS IF (PRESENT (PERSISTENT)) LLPERSISTENT = PERSISTENT SELF%POOLED = POOL_OWNED_FIELDS -#:if defined('CUDA') +#:if defined('WITH_HIC') SELF%PINNED = INIT_PINNED_VALUE #:endif IF(PRESENT(PINNED))THEN @@ -249,7 +246,7 @@ CONTAINS IF(PRESENT(MAP_DEVPTR))THEN SELF%MAP_DEVPTR = MAP_DEVPTR ENDIF -#:if not defined('CUDA') +#:if not defined('WITH_HIC') IF(.NOT. SELF%MAP_DEVPTR)THEN CALL FIELD_ABORT ("${ftn}$_OWNER_INIT: CUDA backend needed to disable host-mapped device-pointer") ENDIF @@ -390,7 +387,7 @@ CONTAINS IF (.NOT. SELF%LOBJECT_COPIED) THEN SELF%LOBJECT_COPIED = .TRUE. -#ifdef _OPENACC +#ifdef WITH_GPU_OFFLOAD IF (ASSOCIATED (SELF%DEVPTR)) THEN $:offload_macros.AttachDevPtr(ptr='SELF%DEVPTR', indent=6) ENDIF @@ -414,7 +411,7 @@ $:offload_macros.AttachDevPtr(ptr='SELF%DEVPTR', indent=6) IF (SELF%LOBJECT_COPIED) THEN SELF%LOBJECT_COPIED = .FALSE. 
-#ifdef _OPENACC +#ifdef WITH_GPU_OFFLOAD IF (ASSOCIATED (SELF%DEVPTR)) THEN $:offload_macros.DetachDevPtr(ptr='SELF%DEVPTR', indent=6) ENDIF diff --git a/src/core/host_alloc_module.fypp b/src/core/host_alloc_module.fypp index 2ed2cc8..ef97ed7 100644 --- a/src/core/host_alloc_module.fypp +++ b/src/core/host_alloc_module.fypp @@ -64,7 +64,7 @@ INTERFACE END SUBROUTINE C_PTR_INCR END INTERFACE -#:if defined('CUDA') +#:if defined('WITH_HIC') INTERFACE $:offload_macros.HostRegisterIntf(indent=3) $:offload_macros.HostUnregisterIntf(indent=3) @@ -117,7 +117,7 @@ SUBROUTINE MEM_BLOCK_INIT( SELF ) CALL C_MALLOC(SELF%SIZE, SELF%DATA) -#:if defined('CUDA') +#:if defined('WITH_HIC') IF(INIT_PINNED_VALUE)THEN CALL PIN_ALLOCATION(SELF%DATA, SELF%SIZE) ENDIF @@ -152,7 +152,7 @@ SUBROUTINE MEM_BLOCK_FINAL( SELF ) PRINT *, "FIELD_API DETECTED UNFINALISED FIELDS, POTENTIAL DEVICE MEMORY LEAK" ENDIF -#:if defined('CUDA') +#:if defined('WITH_HIC') IF (INIT_PINNED_VALUE) THEN CALL UNPIN_ALLOCATION(SELF%DATA) ENDIF @@ -248,7 +248,7 @@ SUBROUTINE MEM_POOL_REQUEST_MEM( SELF, ALLOC_SIZE, BLK, DATA, BLKID ) END SUBROUTINE MEM_POOL_REQUEST_MEM -#:if defined('CUDA') +#:if defined('WITH_HIC') SUBROUTINE PIN_ALLOCATION(DATA, ARR_SIZE) TYPE(C_PTR), INTENT(INOUT) :: DATA @@ -333,7 +333,7 @@ SUBROUTINE ${ft.name}$_HOST_ALLOC (HST, LBOUNDS, UBOUNDS, PINNED) IF(ARR_SIZE > 0)THEN CALL C_MALLOC(ARR_SIZE, DATA) -#:if defined('CUDA') +#:if defined('WITH_HIC') IF(PINNED)THEN CALL PIN_ALLOCATION(DATA, ARR_SIZE) ENDIF @@ -361,7 +361,7 @@ SUBROUTINE ${ft.name}$_HOST_DEALLOC(HST, PINNED) IF(SIZE(HST) > 0)THEN DATA = C_LOC (HST (${ ', '.join (map (lambda i: 'LBOUND (HST, ' + str (i) + ')', range (1, ft.rank+1))) }$)) -#:if defined('CUDA') +#:if defined('WITH_HIC') IF (PINNED) THEN CALL UNPIN_ALLOCATION(DATA) ENDIF From f5dc2800ebdceb103b965082720a582e4a908d8c Mon Sep 17 00:00:00 2001 From: Ahmad Nawab Date: Tue, 4 Feb 2025 20:59:20 +0000 Subject: [PATCH 10/10] Use nvhpc naming convention for offload macros 
--- .../offload_backends/nvhpc/openacc.py | 40 +++--- .../offload_backends/nvhpc/openacc_cuda.py | 10 +- python_utils/offload_macros.py | 130 +++++++++--------- src/core/dev_alloc_module.fypp | 20 +-- src/core/field_RANKSUFF_data_module.fypp | 64 ++++----- src/core/field_RANKSUFF_module.fypp | 8 +- src/core/field_async_module.fypp | 2 +- src/core/host_alloc_module.fypp | 16 +-- .../field_RANKSUFF_shuffle_module.fypp | 8 +- src/util/field_RANKSUFF_access_module.fypp | 2 +- .../field_RANKSUFF_array_util_module.fypp | 10 +- src/util/field_RANKSUFF_util_module.fypp | 4 +- 12 files changed, 157 insertions(+), 157 deletions(-) diff --git a/python_utils/offload_backends/nvhpc/openacc.py b/python_utils/offload_backends/nvhpc/openacc.py index 8830075..abcc205 100644 --- a/python_utils/offload_backends/nvhpc/openacc.py +++ b/python_utils/offload_backends/nvhpc/openacc.py @@ -30,7 +30,7 @@ def runtime_api_import(cls): return "USE OPENACC" @classmethod - def c_devptr_declaration(cls, symbols): + def c_devptr_decl(cls, symbols): """ Type declaration for a `C_PTR` on device. """ @@ -38,7 +38,7 @@ def c_devptr_declaration(cls, symbols): return f"TYPE(C_DEVPTR) :: {','.join(symbols)}" @classmethod - def host_data_start(cls, symbols): + def host_data(cls, symbols): """ Pragma to mark the start of a `host_data` region. """ @@ -46,7 +46,7 @@ def host_data_start(cls, symbols): return f"!$acc host_data use_device({','.join(symbols)})" @classmethod - def host_data_end(cls): + def end_host_data(cls): """ Pragma to mark the end of a `host_data` region. """ @@ -62,7 +62,7 @@ def devptr_c_loc(cls, symbol): return f"C_DEVLOC({symbol})" @classmethod - def copy_to_device_1D(cls, dev, host, size): + def memcpy_to_device(cls, dev, host, size): """ Copy a contiguous section of data from host to device.
""" @@ -70,7 +70,7 @@ def copy_to_device_1D(cls, dev, host, size): return f"CALL ACC_MEMCPY_TO_DEVICE ({dev}, {host}, {size})" @classmethod - def copy_to_device_1D_async(cls, dev, host, queue, size): + def memcpy_to_device_async(cls, dev, host, size, queue): """ Asynchornously copy a contiguous section of data from host to device. """ @@ -78,7 +78,7 @@ def copy_to_device_1D_async(cls, dev, host, queue, size): return f"CALL ACC_MEMCPY_TO_DEVICE_ASYNC ({dev}, {host}, {size}, {queue})" @classmethod - def copy_from_device_1D(cls, dev, host, size): + def memcpy_from_device(cls, dev, host, size): """ Copy a contiguous section of data from device to host. """ @@ -86,7 +86,7 @@ def copy_from_device_1D(cls, dev, host, size): return f"CALL ACC_MEMCPY_FROM_DEVICE ({host}, {dev}, {size})" @classmethod - def copy_from_device_1D_async(cls, dev, host, size, queue): + def memcpy_from_device_async(cls, dev, host, size, queue): """ Asynchronously copy a contiguous section of data from device to host. """ @@ -94,23 +94,23 @@ def copy_from_device_1D_async(cls, dev, host, size, queue): return f"CALL ACC_MEMCPY_FROM_DEVICE_ASYNC ({host}, {dev}, {size}, {queue})" @classmethod - def host_mapped_dev_alloc(cls, data): + def create(cls, symbols): """ Allocate host-mapped memory on device. """ - return f"!$acc enter data create ({','.join(data)})" + return f"!$acc enter data create ({','.join(symbols)})" @classmethod - def host_mapped_dev_free(cls, data): + def delete(cls, symbols): """ Free host-mapped memory on device. """ - return f"!$acc exit data delete ({','.join(data)})" + return f"!$acc exit data delete ({','.join(symbols)})" @classmethod - def attach_dev_ptr(cls, ptr): + def attach(cls, ptr): """ Attach device pointer to its target on device. """ @@ -118,7 +118,7 @@ def attach_dev_ptr(cls, ptr): return f"!$acc enter data attach ({ptr})" @classmethod - def detach_dev_ptr(cls, ptr): + def detach(cls, ptr): """ Detach device pointer from its target on device.
""" @@ -126,7 +126,7 @@ def detach_dev_ptr(cls, ptr): return f"!$acc exit data detach ({ptr})" @classmethod - def launch_kernel(cls, **kwargs): + def kernels(cls, **kwargs): """ Launch an implicitly mapped parallel kernel on device. """ @@ -140,7 +140,7 @@ def launch_kernel(cls, **kwargs): return f"!$acc kernels {_data_spec}" @classmethod - def end_kernel(cls): + def end_kernels(cls): """ End an implicitly mapped parallel kernel on device. """ @@ -156,7 +156,7 @@ def async_wait(cls, stream): return f"!$acc wait ({stream})" @classmethod - def launch_parallel_loop(cls, **kwargs): + def parallel_loop(cls, **kwargs): """ Launch an explicitly mapped parallel kernel on device. """ @@ -182,7 +182,7 @@ def end_parallel_loop(cls): return "!$acc end parallel loop" @classmethod - def annotate_parallel_loop(cls, **kwargs): + def annotate_loop(cls, **kwargs): """ Annotate a loop in a device parallel region. """ @@ -214,7 +214,7 @@ def declare(cls, **kwargs): return f"!$acc declare {_decl_spec}" @classmethod - def launch_serial_kernel(cls, **kwargs): + def serial(cls, **kwargs): """ Launch a serial kernel on device. """ @@ -228,7 +228,7 @@ def launch_serial_kernel(cls, **kwargs): return f"!$acc serial {_data_spec}" @classmethod - def end_serial_kernel(cls): + def end_serial(cls): """ End a serial device kernel. """ @@ -252,7 +252,7 @@ def update_host(cls, data): return f"!$acc update self ({','.join(data)})" @classmethod - def data_start(cls, **kwargs): + def data(cls, **kwargs): """ Pragma to mark the start of a `data` region. 
""" diff --git a/python_utils/offload_backends/nvhpc/openacc_cuda.py b/python_utils/offload_backends/nvhpc/openacc_cuda.py index be6b890..5b369b8 100644 --- a/python_utils/offload_backends/nvhpc/openacc_cuda.py +++ b/python_utils/offload_backends/nvhpc/openacc_cuda.py @@ -69,7 +69,7 @@ def dev_free_intf(cls): return intf.split('\n') @classmethod - def runtime_error_return_type(cls, symbols): + def runtime_error_return_type_decl(cls, symbols): """ Declaration for the variable used to store the runtime API error status. """ @@ -157,15 +157,15 @@ def host_unregister_intf(cls): return intf.split('\n') @classmethod - def set_async_stream(cls, id, stream): + def set_async_stream(cls, queue, stream): """ Set an asynchronous stream. """ - return f"CALL ACC_SET_CUDA_STREAM({id}, {stream})" + return f"CALL ACC_SET_CUDA_STREAM({queue}, {stream})" @classmethod - def copy_2D(cls, src, src_pitch, dst, dst_pitch, width, height, return_val="ISTAT"): + def memcpy_2D(cls, src, src_pitch, dst, dst_pitch, width, height, return_val="ISTAT"): """ Copy a strided memory region from source (src) to destination (dst). """ @@ -173,7 +173,7 @@ def copy_2D(cls, src, src_pitch, dst, dst_pitch, width, height, return_val="ISTA return f"{return_val} = CUDAMEMCPY2D({dst}, {dst_pitch}, {src}, {src_pitch}, {width}, {height})" @classmethod - def copy_2D_async(cls, src, src_pitch, dst, dst_pitch, width, height, stream, return_val="ISTAT"): + def memcpy_2D_async(cls, src, src_pitch, dst, dst_pitch, width, height, stream, return_val="ISTAT"): """ Asynchronously copy a strided memory region from source (src) to destination (dst). """ diff --git a/python_utils/offload_macros.py b/python_utils/offload_macros.py index 31ddccf..0068561 100644 --- a/python_utils/offload_macros.py +++ b/python_utils/offload_macros.py @@ -108,7 +108,7 @@ def _get_method(backend, method): except AttributeError: return _empty_string -def RuntimeApiImport(indent=0): +def runtime_api_import(indent=0): """ Import the runtime API. 
""" @@ -118,37 +118,37 @@ def RuntimeApiImport(indent=0): return _format_lines(method(), indent=indent) -def CDevptrDecl(symbols, indent=0): +def c_devptr_decl(symbols, indent=0): """ Declare symbols of type `TYPE(C_DEVPTR)` (or equivalent). """ backend = _get_offload_backend() - method = _get_method(backend, 'c_devptr_declaration') + method = _get_method(backend, 'c_devptr_decl') return _format_lines(method(symbols), indent=indent) -def HostDataStart(symbols, indent=0): +def host_data(use_device, indent=0): """ Start a `host_data` (or equivalent) region. """ backend = _get_offload_backend() - method = _get_method(backend, 'host_data_start') + method = _get_method(backend, 'host_data') - return _format_lines(method(symbols), indent=indent, pragma=backend.pragma) + return _format_lines(method(use_device), indent=indent, pragma=backend.pragma) -def HostDataEnd(indent=0): +def end_host_data(indent=0): """ End a `host_data` (or equivalent) region. """ backend = _get_offload_backend() - method = _get_method(backend, 'host_data_end') + method = _get_method(backend, 'end_host_data') return _format_lines(method(), indent=indent) -def DevptrCLOC(symbol, indent=0): +def devptr_cloc(symbol, indent=0): """ Get the C address of a device variable. """ @@ -158,108 +158,108 @@ def DevptrCLOC(symbol, indent=0): return _format_lines(method(symbol)) -def CopyToDevice1D(dev, host, size, indent=0): +def memcpy_to_device(dev, host, size, indent=0): """ Copy a contiguous section of data from host to device. """ backend = _get_offload_backend() - method = _get_method(backend, 'copy_to_device_1D') + method = _get_method(backend, 'memcpy_to_device') return _format_lines(method(dev, host, size), indent=indent) -def CopyToDevice1DAsync(dev, host, size, queue, indent=0): +def memcpy_to_device_async(dev, host, size, queue, indent=0): """ Asynchronously copy a contiguous section of data from host to device. 
""" backend = _get_offload_backend() - method = _get_method(backend, 'copy_to_device_1D_async') + method = _get_method(backend, 'memcpy_to_device_async') return _format_lines(method(dev, host, size, queue), indent=indent) -def CopyFromDevice1D(dev, host, size, indent=0): +def memcpy_from_device(dev, host, size, indent=0): """ Copy a contiguous section of data from device to host. """ backend = _get_offload_backend() - method = _get_method(backend, 'copy_from_device_1D') + method = _get_method(backend, 'memcpy_from_device') return _format_lines(method(dev, host, size), indent=indent) -def CopyFromDevice1DAsync(dev, host, size, queue, indent=0): +def memcpy_from_device_async(dev, host, size, queue, indent=0): """ Asynchronously copy a contiguous section of data from device to host. """ backend = _get_offload_backend() - method = _get_method(backend, 'copy_from_device_1D_async') + method = _get_method(backend, 'memcpy_from_device_async') return _format_lines(method(dev, host, size, queue), indent=indent) -def HostMappedDevAlloc(data, indent=0): +def create(symbols, indent=0): """ Allocate host-mapped memory on device. """ backend = _get_offload_backend() - method = _get_method(backend, 'host_mapped_dev_alloc') + method = _get_method(backend, 'create') - return _format_lines(method(data), indent=indent) + return _format_lines(method(symbols), indent=indent) -def HostMappedDevFree(data, indent=0): +def delete(symbols, indent=0): """ Free host-mapped memory on device. """ backend = _get_offload_backend() - method = _get_method(backend, 'host_mapped_dev_free') + method = _get_method(backend, 'delete') - return _format_lines(method(data), indent=indent) + return _format_lines(method(symbols), indent=indent) -def AttachDevPtr(ptr, indent=0): +def attach(ptr, indent=0): """ Attach a device pointer to its target. 
""" backend = _get_offload_backend() - method = _get_method(backend, 'attach_dev_ptr') + method = _get_method(backend, 'attach') return _format_lines(method(ptr), indent=indent) -def DetachDevPtr(ptr, indent=0): +def detach(ptr, indent=0): """ Detach a device pointer from its target. """ backend = _get_offload_backend() - method = _get_method(backend, 'detach_dev_ptr') + method = _get_method(backend, 'detach') return _format_lines(method(ptr), indent=indent) -def LaunchParallelKernel(**kwargs): +def kernels(**kwargs): """ Launch an implicitly mapped parallel kernel on device. """ backend = _get_offload_backend() - method = _get_method(backend, 'launch_kernel') + method = _get_method(backend, 'kernels') indent = kwargs.pop('indent', 0) return _format_lines(method(**kwargs), indent=indent) -def EndParallelKernel(indent=0): +def end_kernels(indent=0): """ End an implicitly mapped parallel kernel on device. """ backend = _get_offload_backend() - method = _get_method(backend, 'end_kernel') + method = _get_method(backend, 'end_kernels') return _format_lines(method(), indent=indent) -def WaitAsyncStream(stream, indent=0): +def async_wait(stream, indent=0): """ Wait for the operations queued on a stream to complete. """ @@ -269,18 +269,18 @@ def WaitAsyncStream(stream, indent=0): return _format_lines(method(stream), indent=indent) -def LaunchParallelLoop(**kwargs): +def parallel_loop(**kwargs): """ Launch an explicitly mapped parallel kernel on device. """ backend = _get_offload_backend() - method = _get_method(backend, 'launch_parallel_loop') + method = _get_method(backend, 'parallel_loop') indent = kwargs.pop('indent', 0) return _format_lines(method(**kwargs), indent=indent) -def EndParallelLoop(indent=0): +def end_parallel_loop(indent=0): """ End an explicitly mapped parallel kernel on device. 
""" @@ -290,18 +290,18 @@ def EndParallelLoop(indent=0): return _format_lines(method(), indent=indent) -def AnnotateParallelLoop(**kwargs): +def annotate_loop(**kwargs): """ Annotate a loop in a device parallel region. """ backend = _get_offload_backend() - method = _get_method(backend, 'annotate_parallel_loop') + method = _get_method(backend, 'annotate_loop') indent = kwargs.pop('indent', 0) return _format_lines(method(**kwargs), indent=indent) -def Declare(**kwargs): +def declare(**kwargs): """ Issue a device declaration for a host-mapped symbol. """ @@ -312,7 +312,7 @@ def Declare(**kwargs): indent = kwargs.pop('indent', 0) return _format_lines(method(**kwargs), indent=indent) -def LaunchSerialKernel(**kwargs): +def serial(**kwargs): """ Launch a serial kernel on device. """ @@ -323,7 +323,7 @@ def LaunchSerialKernel(**kwargs): indent = kwargs.pop('indent', 0) return _format_lines(method(**kwargs), indent=indent) -def EndSerialKernel(indent=0): +def end_serial(indent=0): """ End a serial device kernel. """ @@ -333,7 +333,7 @@ def EndSerialKernel(indent=0): return _format_lines(method(), indent=indent) -def UpdateDevice(data, indent=0): +def update_device(symbols, indent=0): """ Update host-mapped symbol on device. """ @@ -341,9 +341,9 @@ def UpdateDevice(data, indent=0): backend = _get_offload_backend() method = _get_method(backend, 'update_device') - return _format_lines(method(data), indent=indent) + return _format_lines(method(symbols), indent=indent) -def UpdateHost(data, indent=0): +def update_host(symbols, indent=0): """ Update device-mapped symbol on host. """ @@ -351,9 +351,9 @@ def UpdateHost(data, indent=0): backend = _get_offload_backend() method = _get_method(backend, 'update_host') - return _format_lines(method(data), indent=indent) + return _format_lines(method(symbols), indent=indent) -def StreamHandleKind(): +def stream_handle_kind(): """ Return the INTEGER kind specifier for a stream handle. 
""" @@ -363,18 +363,18 @@ def StreamHandleKind(): return _format_lines(method()) -def DataStart(**kwargs): +def data(**kwargs): """ Start a `data` (or equivalent) region. """ backend = _get_offload_backend() - method = _get_method(backend, 'data_start') + method = _get_method(backend, 'data') indent = kwargs.pop('indent', 0) return _format_lines(method(**kwargs), indent=indent, pragma=backend.pragma) -def DataEnd(indent=0): +def end_data(indent=0): """ End a `data` (or equivalent) region. """ @@ -384,7 +384,7 @@ def DataEnd(indent=0): return _format_lines(method(), indent=indent) -def DevMallocIntf(indent=0): +def dev_malloc_intf(indent=0): """ The ISO_C interface for a device memory allocation. """ @@ -394,7 +394,7 @@ def DevMallocIntf(indent=0): return _format_lines(method(), indent=indent) -def DevFreeIntf(indent=0): +def dev_free_intf(indent=0): """ The ISO_C interface for freeing device memory. """ @@ -404,17 +404,17 @@ def DevFreeIntf(indent=0): return _format_lines(method(), indent=indent) -def RuntimeErrorType(symbols, indent=0): +def runtime_error_decl(symbols, indent=0): """ Declaration for the variable used to store the runtime API error status. """ backend = _get_offload_backend() - method = _get_method(backend, 'runtime_error_return_type') + method = _get_method(backend, 'runtime_error_return_type_decl') return _format_lines(method(symbols), indent=indent) -def DevMalloc(ptr, size, return_val="ISTAT", indent=0): +def dev_malloc(ptr, size, return_val="ISTAT", indent=0): """ Allocate memory on device. """ @@ -424,7 +424,7 @@ def DevMalloc(ptr, size, return_val="ISTAT", indent=0): return _format_lines(method(ptr, size, return_val=return_val), indent=indent) -def DevFree(ptr, return_val="ISTAT", indent=0): +def dev_free(ptr, return_val="ISTAT", indent=0): """ Free device memory. 
""" @@ -434,7 +434,7 @@ def DevFree(ptr, return_val="ISTAT", indent=0): return _format_lines(method(ptr, return_val=return_val), indent=indent) -def RegisterHostSetFlags(flag_var, val, indent=0): +def register_host_set_flags(flag_var, val, indent=0): """ Set flags for page-locking host memory. """ @@ -444,7 +444,7 @@ def RegisterHostSetFlags(flag_var, val, indent=0): return _format_lines(method(flag_var, val), indent=indent) -def RegisterHostDeclFlags(flag_var, indent=0): +def register_host_decl_flags(flag_var, indent=0): """ Declare variable used to store flags for controlling page-locking of host memory. """ @@ -454,7 +454,7 @@ def RegisterHostDeclFlags(flag_var, indent=0): return _format_lines(method(flag_var), indent=indent) -def RegisterHost(ptr, size, flags, return_val="ISTAT", indent=0): +def register_host(ptr, size, flags, return_val="ISTAT", indent=0): """ Page-lock host memory. """ @@ -464,7 +464,7 @@ def RegisterHost(ptr, size, flags, return_val="ISTAT", indent=0): return _format_lines(method(ptr, size, flags, return_val=return_val), indent=indent) -def UnregisterHost(ptr, return_val="ISTAT", indent=0): +def unregister_host(ptr, return_val="ISTAT", indent=0): """ Unpin (i.e. undo page-locking) host memory. """ @@ -474,7 +474,7 @@ def UnregisterHost(ptr, return_val="ISTAT", indent=0): return _format_lines(method(ptr, return_val=return_val), indent=indent) -def HostRegisterIntf(indent=0): +def host_register_intf(indent=0): """ The ISO_C interface for page-locking host memory. """ @@ -484,7 +484,7 @@ def HostRegisterIntf(indent=0): return _format_lines(method(), indent=indent) -def HostUnregisterIntf(indent=0): +def host_unregister_intf(indent=0): """ The ISO_C interface for un-pinning (i.e. undo page-locking) host memory. """ @@ -494,7 +494,7 @@ def HostUnregisterIntf(indent=0): return _format_lines(method(), indent=indent) -def SetAsyncStream(id, stream, indent=0): +def set_async_stream(queue, stream, indent=0): """ Set an asynchronous stream. 
""" @@ -502,20 +502,20 @@ def SetAsyncStream(id, stream, indent=0): backend = _get_offload_backend() method = _get_method(backend, 'set_async_stream') - return _format_lines(method(id, stream), indent=indent) + return _format_lines(method(queue, stream), indent=indent) -def Copy2D(src, src_pitch, dst, dst_pitch, width, height, return_val="ISTAT", indent=0): +def memcpy_2D(src, src_pitch, dst, dst_pitch, width, height, return_val="ISTAT", indent=0): """ Copy a strided memory region from source (src) to destination (dst). """ backend = _get_offload_backend() - method = _get_method(backend, 'copy_2D') + method = _get_method(backend, 'memcpy_2D') return _format_lines(method(src, src_pitch, dst, dst_pitch, width, height, return_val=return_val), indent=indent) -def Copy2DAsync(src, src_pitch, dst, dst_pitch, width, height, stream, return_val="ISTAT", indent=0): +def memcpy_2D_async(src, src_pitch, dst, dst_pitch, width, height, stream, return_val="ISTAT", indent=0): """ Asynchronously copy a strided memory region from source (src) to destination (dst). 
""" diff --git a/src/core/dev_alloc_module.fypp b/src/core/dev_alloc_module.fypp index 2331f8a..b658fd4 100644 --- a/src/core/dev_alloc_module.fypp +++ b/src/core/dev_alloc_module.fypp @@ -53,8 +53,8 @@ END INTERFACE #:if defined('WITH_HIC') INTERFACE -$:offload_macros.DevMallocIntf(indent=2) -$:offload_macros.DevFreeIntf(indent=2) +$:offload_macros.dev_malloc_intf(indent=2) +$:offload_macros.dev_free_intf(indent=2) END INTERFACE #:endif @@ -95,7 +95,7 @@ LOGICAL, INTENT(IN) :: MAP_DEVPTR TYPE (C_PTR) :: PTR INTEGER (C_SIZE_T) :: SIZ #:if defined('WITH_HIC') -$:offload_macros.RuntimeErrorType(symbols=['ISTAT',]) +$:offload_macros.runtime_error_decl(symbols=['ISTAT',]) #:endif ILBOUNDS = 1 @@ -110,7 +110,7 @@ IF(MAP_DEVPTR)THEN CALL DEV_MALLOC (SIZ, PTR) ELSE #:if defined('WITH_HIC') -$:offload_macros.DevMalloc(ptr='PTR', size='SIZ', return_val='ISTAT') +$:offload_macros.dev_malloc(ptr='PTR', size='SIZ', return_val='ISTAT') #:endif ENDIF @@ -118,7 +118,7 @@ CALL C_F_POINTER (PTR, TMP, UBOUNDS-ILBOUNDS+1) DEV (${ ', '.join (map (lambda i: 'ILBOUNDS (' + str (i) + '):', range (1, ft.rank+1))) }$) => TMP IF(MAP_DEVPTR)THEN -$:offload_macros.HostMappedDevAlloc(data=['DEV',]) +$:offload_macros.create(symbols=['DEV',]) ENDIF IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_DEVICE_ALLOCATE (SIZE (DEV, KIND=JPIB) * INT (KIND (DEV), KIND=JPIB)) @@ -133,7 +133,7 @@ ${ft.type}$, POINTER :: DEV(${ft.shape}$) LOGICAL, INTENT(IN) :: MAP_DEVPTR TYPE (C_PTR) :: PTR #:if defined('WITH_HIC') -$:offload_macros.RuntimeErrorType(symbols=['ISTAT',]) +$:offload_macros.runtime_error_decl(symbols=['ISTAT',]) #:endif IF (ASSOCIATED (DEV)) THEN @@ -144,10 +144,10 @@ IF (ASSOCIATED (DEV)) THEN IF(MAP_DEVPTR)THEN CALL DEV_FREE (PTR) -$:offload_macros.HostMappedDevFree(data=['DEV',], indent=4) +$:offload_macros.delete(symbols=['DEV',], indent=4) ELSE #:if defined('WITH_HIC') -$:offload_macros.DevFree(ptr='PTR', return_val='ISTAT') +$:offload_macros.dev_free(ptr='PTR', return_val='ISTAT') 
#:endif ENDIF @@ -174,7 +174,7 @@ ALLOCATE (DEV (${ ', '.join (map (lambda i: 'LBOUND (HST, ' + str (i) + '):UBOUN ALLOCATE (DEV, MOLD=HST) #endif -$:offload_macros.HostMappedDevAlloc(data=['DEV',]) +$:offload_macros.create(symbols=['DEV',]) IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_DEVICE_ALLOCATE (SIZE (DEV, KIND=JPIB) * INT (KIND (DEV), KIND=JPIB)) @@ -191,7 +191,7 @@ IF (ASSOCIATED (DEV)) THEN IF (FIELD_STATISTICS_ENABLE) CALL FIELD_STATISTICS_DEVICE_DEALLOCATE (SIZE (DEV, KIND=JPIB) * INT (KIND (DEV), KIND=JPIB)) -$:offload_macros.HostMappedDevFree(data=['DEV',], indent=2) +$:offload_macros.delete(symbols=['DEV',], indent=2) DEALLOCATE (DEV) NULLIFY (DEV) ENDIF diff --git a/src/core/field_RANKSUFF_data_module.fypp b/src/core/field_RANKSUFF_data_module.fypp index 9616664..a14aa7a 100644 --- a/src/core/field_RANKSUFF_data_module.fypp +++ b/src/core/field_RANKSUFF_data_module.fypp @@ -111,7 +111,7 @@ CONTAINS #:for d in range (0, ft.rank+1) SUBROUTINE ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) -$:offload_macros.RuntimeApiImport(indent=2) +$:offload_macros.runtime_api_import(indent=2) USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY : INT64 ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) LOGICAL, INTENT (IN) :: MAP_DEVPTR @@ -119,7 +119,7 @@ $:offload_macros.RuntimeApiImport(indent=2) INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE INTEGER (KIND=INT64) :: ISIZE INTEGER :: ${', '.join (['J'] + list (map (lambda i: 'J' + str (i+1), range (d, ft.rank))))}$ -$:offload_macros.CDevptrDecl(symbols=['DEVPTR'], indent=4) +$:offload_macros.c_devptr_decl(symbols=['DEVPTR'], indent=4) #:for e in range (ft.rank, d, -1) ${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$) @@ -130,13 +130,13 @@ $:offload_macros.CDevptrDecl(symbols=['DEVPTR'], indent=4) #:set indent = ' ' * (ft.rank - e) #ifdef WITH_GPU_OFFLOAD ${indent}$ IF(MAP_DEVPTR)THEN -$:offload_macros.HostDataStart(symbols=[f'DEV ({ard})'], indent=ft.rank 
- e) - ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(f'DEV ({ard})')}$ -$:offload_macros.HostDataEnd(indent=ft.rank - e) +$:offload_macros.host_data(use_device=[f'DEV ({ard})'], indent=ft.rank - e) + ${indent}$ DEVPTR = ${offload_macros.devptr_cloc(f'DEV ({ard})')}$ +$:offload_macros.end_host_data(indent=ft.rank - e) ${indent}$ ELSE -$:offload_macros.DataStart(deviceptr=['DEVPTR', 'DEV'], indent=ft.rank - e) - ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(f'DEV ({ard})')}$ -$:offload_macros.DataEnd(indent=ft.rank - e) +$:offload_macros.data(deviceptr=['DEVPTR', 'DEV'], indent=ft.rank - e) + ${indent}$ DEVPTR = ${offload_macros.devptr_cloc(f'DEV ({ard})')}$ +$:offload_macros.end_data(indent=ft.rank - e) ${indent}$ ENDIF #endif #:if d == 0 @@ -147,9 +147,9 @@ $:offload_macros.DataEnd(indent=ft.rank - e) ${indent}$ IF (KDIR == NH2D) THEN #ifdef WITH_GPU_OFFLOAD ${indent}$ IF(PRESENT(QUEUE))THEN -$:offload_macros.CopyToDevice1DAsync(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) +$:offload_macros.memcpy_to_device_async(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) ${indent}$ ELSE -$:offload_macros.CopyToDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', indent=ft.rank - e + 10) +$:offload_macros.memcpy_to_device(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', indent=ft.rank - e + 10) ${indent}$ ENDIF #else ${indent}$ DEV (${ard}$) = HST (${ar}$) @@ -157,9 +157,9 @@ $:offload_macros.CopyToDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', ${indent}$ ELSEIF (KDIR == ND2H) THEN #ifdef WITH_GPU_OFFLOAD ${indent}$ IF(PRESENT(QUEUE))THEN -$:offload_macros.CopyFromDevice1DAsync(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) +$:offload_macros.memcpy_from_device_async(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', queue='QUEUE', indent=ft.rank - e + 10) ${indent}$ ELSE -$:offload_macros.CopyFromDevice1D(dev='DEVPTR', host=f'HST ({ar})', 
size='ISIZE', indent=ft.rank - e + 10) +$:offload_macros.memcpy_from_device(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE', indent=ft.rank - e + 10) ${indent}$ ENDIF #else ${indent}$ HST (${ar}$) = DEV (${ard}$) @@ -176,7 +176,7 @@ $:offload_macros.CopyFromDevice1D(dev='DEVPTR', host=f'HST ({ar})', size='ISIZE' #:for d1 in range (0, ft.rank) #:for d2 in range (d1+1, ft.rank+1) SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE) -$:offload_macros.RuntimeApiImport(indent=4) +$:offload_macros.runtime_api_import(indent=4) USE FIELD_ABORT_MODULE ${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$) @@ -188,10 +188,10 @@ $:offload_macros.RuntimeApiImport(indent=4) #:if d2 < ft.rank INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d2, ft.rank))))}$ #:endif - INTEGER(KIND=${offload_macros.StreamHandleKind()}$) :: STREAM + INTEGER(KIND=${offload_macros.stream_handle_kind()}$) :: STREAM TYPE(C_PTR) :: HSTPTR -$:offload_macros.CDevptrDecl(symbols=['DEVPTR'], indent=4) -$:offload_macros.RuntimeErrorType(symbols=['IRET',], indent=4) +$:offload_macros.c_devptr_decl(symbols=['DEVPTR'], indent=4) +$:offload_macros.runtime_error_decl(symbols=['IRET',], indent=4) ISHP(1) = 1 ISHP(2:) = SHAPE(HST) @@ -212,34 +212,34 @@ $:offload_macros.RuntimeErrorType(symbols=['IRET',], indent=4) #:set ar = lambda arr: ', '.join(lbnds(arr, 0, d2) + [f'J{i+1}' for i in range(d2, ft.rank)]) ${indent}$ HSTPTR = C_LOC(HST (${ar('HST')}$)) ${indent}$ IF (MAP_DEVPTR) THEN -$:offload_macros.HostDataStart(symbols=['DEV',], indent=ft.rank - d2 - 1) - ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(symbol=f"DEV({ar('DEV')})")}$ -$:offload_macros.HostDataEnd(indent=ft.rank - d2 - 1) +$:offload_macros.host_data(use_device=['DEV',], indent=ft.rank - d2 - 1) + ${indent}$ DEVPTR = ${offload_macros.devptr_cloc(symbol=f"DEV({ar('DEV')})")}$ +$:offload_macros.end_host_data(indent=ft.rank - d2 - 1) ${indent}$ ELSE 
-$:offload_macros.DataStart(deviceptr=['DEVPTR', 'DEV'], indent=ft.rank - d2 - 1) - ${indent}$ DEVPTR = ${offload_macros.DevptrCLOC(symbol=f"DEV({ar('DEV')})")}$ -$:offload_macros.DataEnd(indent=ft.rank - d2 - 1) +$:offload_macros.data(deviceptr=['DEVPTR', 'DEV'], indent=ft.rank - d2 - 1) + ${indent}$ DEVPTR = ${offload_macros.devptr_cloc(symbol=f"DEV({ar('DEV')})")}$ +$:offload_macros.end_data(indent=ft.rank - d2 - 1) ${indent}$ ENDIF ${indent}$ IF (KDIR == NH2D) THEN ${indent}$ IF(PRESENT(QUEUE)) THEN -$:offload_macros.SetAsyncStream(id='QUEUE', stream='STREAM', indent=ft.rank - d2 + 9) -$:offload_macros.Copy2DAsync(dst='DEVPTR', dst_pitch='IDEV_PITCH', src='HSTPTR', src_pitch='IHST_PITCH', & -& width='IWIDTH', height='IHEIGHT', stream='STREAM', return_val='IRET', indent=ft.rank - d2 + 9) +$:offload_macros.set_async_stream(queue='QUEUE', stream='STREAM', indent=ft.rank - d2 + 9) +$:offload_macros.memcpy_2D_async(dst='DEVPTR', dst_pitch='IDEV_PITCH', src='HSTPTR', src_pitch='IHST_PITCH', & +& width='IWIDTH', height='IHEIGHT', stream='STREAM', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ELSE -$:offload_macros.Copy2D(dst='DEVPTR', dst_pitch='IDEV_PITCH', src='HSTPTR', src_pitch='IHST_PITCH', & -& width='IWIDTH', height='IHEIGHT', return_val='IRET', indent=ft.rank - d2 + 9) +$:offload_macros.memcpy_2D(dst='DEVPTR', dst_pitch='IDEV_PITCH', src='HSTPTR', src_pitch='IHST_PITCH', & +& width='IWIDTH', height='IHEIGHT', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ENDIF ${indent}$ IF (IRET /= 0) THEN ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: HOST-TO-DEVICE TRANSFER FAILED") ${indent}$ ENDIF ${indent}$ ELSEIF (KDIR == ND2H) THEN ${indent}$ IF(PRESENT(QUEUE)) THEN -$:offload_macros.SetAsyncStream(id='QUEUE', stream='STREAM', indent=ft.rank - d2 + 9) -$:offload_macros.Copy2DAsync(dst='HSTPTR', dst_pitch='IHST_PITCH', src='DEVPTR', src_pitch='IDEV_PITCH', & -& width='IWIDTH', height='IHEIGHT', stream='STREAM', 
return_val='IRET', indent=ft.rank - d2 + 9) +$:offload_macros.set_async_stream(queue='QUEUE', stream='STREAM', indent=ft.rank - d2 + 9) +$:offload_macros.memcpy_2D_async(dst='HSTPTR', dst_pitch='IHST_PITCH', src='DEVPTR', src_pitch='IDEV_PITCH', & +& width='IWIDTH', height='IHEIGHT', stream='STREAM', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ELSE -$:offload_macros.Copy2D(dst='HSTPTR', dst_pitch='IHST_PITCH', src='DEVPTR', src_pitch='IDEV_PITCH', & -& width='IWIDTH', height='IHEIGHT', return_val='IRET', indent=ft.rank - d2 + 9) +$:offload_macros.memcpy_2D(dst='HSTPTR', dst_pitch='IHST_PITCH', src='DEVPTR', src_pitch='IDEV_PITCH', & +& width='IWIDTH', height='IHEIGHT', return_val='IRET', indent=ft.rank - d2 + 9) ${indent}$ ENDIF ${indent}$ IF (IRET /= 0) THEN ${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: DEVICE-TO-HOST TRANSFER FAILED") diff --git a/src/core/field_RANKSUFF_module.fypp b/src/core/field_RANKSUFF_module.fypp index 884e5f0..7cdd2ba 100644 --- a/src/core/field_RANKSUFF_module.fypp +++ b/src/core/field_RANKSUFF_module.fypp @@ -389,7 +389,7 @@ CONTAINS SELF%LOBJECT_COPIED = .TRUE. #ifdef WITH_GPU_OFFLOAD IF (ASSOCIATED (SELF%DEVPTR)) THEN -$:offload_macros.AttachDevPtr(ptr='SELF%DEVPTR', indent=6) +$:offload_macros.attach(ptr='SELF%DEVPTR', indent=6) ENDIF #endif ENDIF @@ -413,7 +413,7 @@ $:offload_macros.AttachDevPtr(ptr='SELF%DEVPTR', indent=6) SELF%LOBJECT_COPIED = .FALSE. 
#ifdef WITH_GPU_OFFLOAD IF (ASSOCIATED (SELF%DEVPTR)) THEN -$:offload_macros.DetachDevPtr(ptr='SELF%DEVPTR', indent=6) +$:offload_macros.detach(ptr='SELF%DEVPTR', indent=6) ENDIF #endif ENDIF @@ -558,9 +558,9 @@ $:offload_macros.DetachDevPtr(ptr='SELF%DEVPTR', indent=6) ELSEIF (IAND (SELF%GET_STATUS (), NDEVFRESH) /= 0) THEN CALL SELF%GET_DEVICE_DATA_RDONLY (PTR) ALLOCATE (ZZ, MOLD=PTR) -$:offload_macros.LaunchParallelKernel(present=['PTR',], copyout=['ZZ',]) +$:offload_macros.kernels(present=['PTR',], copyout=['ZZ',]) ZZ = PTR -$:offload_macros.EndParallelKernel() +$:offload_macros.end_kernels() ILEN = SIZE (ZZ) * KIND (ZZ) CALL CRC64 (ZZ, ILEN, ICRC) ENDIF diff --git a/src/core/field_async_module.fypp b/src/core/field_async_module.fypp index 4b8b365..ba30e2a 100644 --- a/src/core/field_async_module.fypp +++ b/src/core/field_async_module.fypp @@ -19,7 +19,7 @@ USE PARKIND1, ONLY : JPIM INTEGER(KIND=JPIM), INTENT(IN) :: QUEUE !Wait for all data transfer initiated on queue by the current thread -$:offload_macros.WaitAsyncStream(stream='QUEUE') +$:offload_macros.async_wait(stream='QUEUE') END SUBROUTINE WAIT_FOR_ASYNC_QUEUE END MODULE FIELD_ASYNC_MODULE diff --git a/src/core/host_alloc_module.fypp b/src/core/host_alloc_module.fypp index ef97ed7..f933c9f 100644 --- a/src/core/host_alloc_module.fypp +++ b/src/core/host_alloc_module.fypp @@ -66,8 +66,8 @@ END INTERFACE #:if defined('WITH_HIC') INTERFACE -$:offload_macros.HostRegisterIntf(indent=3) -$:offload_macros.HostUnregisterIntf(indent=3) +$:offload_macros.host_register_intf(indent=3) +$:offload_macros.host_unregister_intf(indent=3) END INTERFACE #:endif @@ -253,12 +253,12 @@ SUBROUTINE PIN_ALLOCATION(DATA, ARR_SIZE) TYPE(C_PTR), INTENT(INOUT) :: DATA INTEGER(C_SIZE_T), INTENT(IN) :: ARR_SIZE -$:offload_macros.RuntimeErrorType(symbols=['ISTAT',], indent=3) +$:offload_macros.runtime_error_decl(symbols=['ISTAT',], indent=3) -$:offload_macros.RegisterHostDeclFlags(flag_var='FLAGS', indent=3) 
+$:offload_macros.register_host_decl_flags(flag_var='FLAGS', indent=3) -$:offload_macros.RegisterHostSetFlags(flag_var='FLAGS', val='2', indent=3) -$:offload_macros.RegisterHost(ptr='DATA', size='ARR_SIZE', flags='FLAGS', return_val='ISTAT') +$:offload_macros.register_host_set_flags(flag_var='FLAGS', val='2', indent=3) +$:offload_macros.register_host(ptr='DATA', size='ARR_SIZE', flags='FLAGS', return_val='ISTAT') IF (ISTAT /= 0) THEN CALL FIELD_ABORT ("FAILED TO REGISTER IN PAGE-LOCKED MEMORY") @@ -269,9 +269,9 @@ END SUBROUTINE PIN_ALLOCATION SUBROUTINE UNPIN_ALLOCATION(DATA) TYPE(C_PTR), INTENT(INOUT) :: DATA -$:offload_macros.RuntimeErrorType(symbols=['ISTAT',], indent=3) +$:offload_macros.runtime_error_decl(symbols=['ISTAT',], indent=3) -$:offload_macros.UnregisterHost(ptr='DATA', return_val='ISTAT', indent=3) +$:offload_macros.unregister_host(ptr='DATA', return_val='ISTAT', indent=3) IF (ISTAT /= 0) THEN CALL FIELD_ABORT ("FAILED TO UNREGISTER PAGE-LOCKED MEMORY") diff --git a/src/shuffle/field_RANKSUFF_shuffle_module.fypp b/src/shuffle/field_RANKSUFF_shuffle_module.fypp index 8a7d612..bf41939 100644 --- a/src/shuffle/field_RANKSUFF_shuffle_module.fypp +++ b/src/shuffle/field_RANKSUFF_shuffle_module.fypp @@ -158,13 +158,13 @@ INTEGER (KIND=JPIM) :: ${ind}$ #:endif #:if what == 'DEVICE' -$:offload_macros.LaunchParallelLoop(gang=True, present=['PTRG', 'PTRS', 'KNDS']) +$:offload_macros.parallel_loop(gang=True, present=['PTRG', 'PTRS', 'KNDS']) #:elif what == 'HOST' !$OMP PARALLEL DO PRIVATE (${ind}$JBLKG, JLONG, JBLKS, JLONS) #:endif DO JBLKG = 1, SIZE (KNDS, 3) #:if what == 'DEVICE' -$:offload_macros.AnnotateParallelLoop(vector=True, private=[f'{ind}JLONG', 'JBLKS', 'JLONS']) +$:offload_macros.annotate_loop(vector=True, private=[f'{ind}JLONG', 'JBLKS', 'JLONS']) #:endif DO JLONG = 1, SIZE (KNDS, 2) JLONS = KNDS (NLONDIM, JLONG, JBLKG) @@ -253,13 +253,13 @@ INTEGER (KIND=JPIM) :: ${ind}$ #:endif #:if what == 'DEVICE' 
-$:offload_macros.LaunchParallelLoop(gang=True, present=['PTRG', 'PTRS', 'KNDS']) +$:offload_macros.parallel_loop(gang=True, present=['PTRG', 'PTRS', 'KNDS']) #:elif what == 'HOST' !$OMP PARALLEL DO PRIVATE (${ind}$JBLKG, JLONG, JBLKS, JLONS) #:endif DO JBLKG = 1, SIZE (KNDS, 3) #:if what == 'DEVICE' -$:offload_macros.AnnotateParallelLoop(vector=True, private=[f'{ind}JLONG', 'JBLKS', 'JLONS']) +$:offload_macros.annotate_loop(vector=True, private=[f'{ind}JLONG', 'JBLKS', 'JLONS']) #:endif DO JLONG = 1, SIZE (KNDS, 2) JLONS = KNDS (NLONDIM, JLONG, JBLKG) diff --git a/src/util/field_RANKSUFF_access_module.fypp b/src/util/field_RANKSUFF_access_module.fypp index ca2b78d..4d658d9 100644 --- a/src/util/field_RANKSUFF_access_module.fypp +++ b/src/util/field_RANKSUFF_access_module.fypp @@ -44,7 +44,7 @@ PUBLIC :: GET_${what}$_DATA_${mode}$ #:for ft in fieldTypeList ${ft.type}$, TARGET, SAVE :: DUMMY_${ft.name}$ (${ ', '.join ([dumsize] * (ft.rank-1) + ['1']) }$) -$:offload_macros.Declare(create=[f'DUMMY_{ft.name}',]) +$:offload_macros.declare(create=[f'DUMMY_{ft.name}',]) #:endfor diff --git a/src/util/field_RANKSUFF_array_util_module.fypp b/src/util/field_RANKSUFF_array_util_module.fypp index 79a9aa3..bc7be6b 100644 --- a/src/util/field_RANKSUFF_array_util_module.fypp +++ b/src/util/field_RANKSUFF_array_util_module.fypp @@ -66,14 +66,14 @@ LLCREATED = .FALSE. IF (PRESENT (LDCREATED)) LLCREATED = LDCREATED IF (.NOT. LLCREATED) THEN -$:offload_macros.HostMappedDevAlloc(data=['SELF',], indent=2) -$:offload_macros.UpdateDevice(data=['SELF',], indent=2) +$:offload_macros.create(symbols=['SELF',], indent=2) +$:offload_macros.update_device(symbols=['SELF',], indent=2) ENDIF -$:offload_macros.LaunchSerialKernel(present=['SELF',]) +$:offload_macros.kernels(present=['SELF',]) NULLIFY (SELF%P) NULLIFY (SELF%F_P) -$:offload_macros.EndSerialKernel() +$:offload_macros.end_kernels() IF (LLFIELDAPI .AND. 
ASSOCIATED (SELF%F_P)) THEN CALL COPY (SELF%F_P, LDCREATED) @@ -100,7 +100,7 @@ IF (LLFIELDAPI .AND. ASSOCIATED (SELF%F_P)) THEN ENDIF IF (.NOT. LLDELETED) THEN -$:offload_macros.HostMappedDevFree(data=['SELF',], indent=2) +$:offload_macros.delete(symbols=['SELF',], indent=2) ENDIF END SUBROUTINE diff --git a/src/util/field_RANKSUFF_util_module.fypp b/src/util/field_RANKSUFF_util_module.fypp index 541620c..8d3993b 100644 --- a/src/util/field_RANKSUFF_util_module.fypp +++ b/src/util/field_RANKSUFF_util_module.fypp @@ -162,9 +162,9 @@ SUBROUTINE LEGACY_${ft.name}$_ASSIGN (PTR_RHS, PTR_LHS) ${ft.type}$ :: PTR_RHS (${ft.shape}$), PTR_LHS (${ft.shape}$) -$:offload_macros.LaunchParallelKernel(present=['PTR_RHS', 'PTR_LHS']) +$:offload_macros.kernels(present=['PTR_RHS', 'PTR_LHS']) PTR_RHS = PTR_LHS -$:offload_macros.EndParallelKernel() +$:offload_macros.end_kernels() END SUBROUTINE