From 2e31f5ad108d30f9ebfe7ede57e5283b51c6ac2c Mon Sep 17 00:00:00 2001 From: Scott J Breyer Date: Mon, 2 Oct 2023 10:26:43 -0400 Subject: [PATCH] PSM3 OFI Provider from IEFS 11_5_1_1_1 (#9) Ensure 11.5.1.1 release matches SRPM found in IEFS release Signed-off-by: Scott Breyer --- VERSION | 2 +- configure | 22 ++-- debian/changelog | 2 +- libpsm3-fi.spec | 4 +- man/man7/fi_psm3.7 | 2 +- psm3/hal_sockets/sockets_ep.c | 3 + psm3/hal_verbs/verbs_ep.c | 31 +++++ psm3/psm.c | 83 +++++++++---- psm3/psm_oneapi_ze.c | 122 ++++++++++++++++-- psm3/psm_sysbuf.c | 27 +++- psm3/psm_user.h | 199 +++++++++++------------------- psm3/ptl_ips/ips_expected_proto.h | 4 +- psm3/ptl_ips/ips_proto.h | 4 +- 13 files changed, 321 insertions(+), 184 deletions(-) diff --git a/VERSION b/VERSION index e069fd0..8eef61c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -11_5_1_0 +11_5_1_1 diff --git a/configure b/configure index 779e8dc..109100c 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for libpsm3-fi 11.5.1.0. +# Generated by GNU Autoconf 2.69 for libpsm3-fi 11.5.1.1. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -587,8 +587,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='libpsm3-fi' PACKAGE_TARNAME='libpsm3-fi' -PACKAGE_VERSION='11.5.1.0' -PACKAGE_STRING='libpsm3-fi 11.5.1.0' +PACKAGE_VERSION='11.5.1.1' +PACKAGE_STRING='libpsm3-fi 11.5.1.1' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1366,7 +1366,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures libpsm3-fi 11.5.1.0 to adapt to many kinds of systems. +\`configure' configures libpsm3-fi 11.5.1.1 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1436,7 +1436,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of libpsm3-fi 11.5.1.0:";; + short | recursive ) echo "Configuration of libpsm3-fi 11.5.1.1:";; esac cat <<\_ACEOF @@ -1589,7 +1589,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -libpsm3-fi configure 11.5.1.0 +libpsm3-fi configure 11.5.1.1 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by libpsm3-fi $as_me 11.5.1.0, which was +It was created by libpsm3-fi $as_me 11.5.1.1, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2869,7 +2869,7 @@ fi # Define the identity of the package. PACKAGE='libpsm3-fi' - VERSION='11.5.1.0' + VERSION='11.5.1.1' cat >>confdefs.h <<_ACEOF @@ -14045,7 +14045,7 @@ Usage: $0 [OPTIONS] Report bugs to ." lt_cl_version="\ -libpsm3-fi config.lt 11.5.1.0 +libpsm3-fi config.lt 11.5.1.1 configured by $0, generated by GNU Autoconf 2.69. Copyright (C) 2011 Free Software Foundation, Inc. @@ -17319,7 +17319,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by libpsm3-fi $as_me 11.5.1.0, which was +This file was extended by libpsm3-fi $as_me 11.5.1.1, which was generated by GNU Autoconf 2.69. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -17385,7 +17385,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -libpsm3-fi config.status 11.5.1.0 +libpsm3-fi config.status 11.5.1.1 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/debian/changelog b/debian/changelog index a4688da..7eaab21 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.5.1.0-3) unstable; urgency=medium +libpsm3-fi (11.5.1.1-1) unstable; urgency=medium * Initial release diff --git a/libpsm3-fi.spec b/libpsm3-fi.spec index b33feb2..dd22e69 100644 --- a/libpsm3-fi.spec +++ b/libpsm3-fi.spec @@ -3,8 +3,8 @@ %{!?provider_formal: %define provider_formal PSM3} Name: lib%{provider}-fi -Version: 11.5.1.0 -Release: 3 +Version: 11.5.1.1 +Release: 1 Summary: Dynamic %{provider_formal} provider for Libfabric Group: System Environment/Libraries diff --git a/man/man7/fi_psm3.7 b/man/man7/fi_psm3.7 index 1d448fb..54ff96e 100644 --- a/man/man7/fi_psm3.7 +++ b/man/man7/fi_psm3.7 @@ -1,6 +1,6 @@ .\" Automatically generated by Pandoc 2.9.2.1 .\" -.TH "fi_psm3" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "Libfabric v11.5.1.0" +.TH "fi_psm3" "7" "2022\-12\-09" "Libfabric Programmer\[cq]s Manual" "Libfabric v11.5.1.1" .hy .SH NAME .PP diff --git a/psm3/hal_sockets/sockets_ep.c b/psm3/hal_sockets/sockets_ep.c index a54407d..8e095b7 100755 --- a/psm3/hal_sockets/sockets_ep.c +++ b/psm3/hal_sockets/sockets_ep.c @@ -816,6 +816,9 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) // GPU Direct is enabled and we need a GPU Cache loc_info.rdma_mode = RV_RDMA_MODE_GPU_ONLY; +#ifdef PSM_ONEAPI + psm3_oneapi_ze_can_use_zemem(); +#endif // need portnum for rdma_mode KERNEL or (USER|GPU) loc_info.port_num = ep->portnum; diff --git a/psm3/hal_verbs/verbs_ep.c b/psm3/hal_verbs/verbs_ep.c index 50db25b..979787b 100644 --- a/psm3/hal_verbs/verbs_ep.c +++ b/psm3/hal_verbs/verbs_ep.c @@ -796,6 +796,15 @@ psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, _HFI_ERROR( "can't alloc send buffers"); goto fail; } +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + // By registering memory with Level Zero, we make + // zeCommandListAppendMemoryCopy run faster for copies from + // GPU to the send buffer. + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, + ze_driver, pool->send_buffers, + pool->send_total*pool->send_buffer_size); +#endif _HFI_PRDBG("send pool: buffers: %p size %u\n", pool->send_buffers, pool->send_buffer_size); pool->send_bufs = (struct verbs_sbuf *)psmi_calloc(ep, NETWORK_BUFFERS, @@ -883,6 +892,15 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, _HFI_ERROR( "can't alloc recv buffers"); goto fail; } +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + // By registering memory with Level Zero, we make + // zeCommandListAppendMemoryCopy run faster for copies from + // recv buffer to GPU + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, + ze_driver, pool->recv_buffers, + pool->recv_total*pool->recv_buffer_size); +#endif //printf("recv pool: buffers: %p size %u\n", pool->recv_buffers, pool->recv_buffer_size); #ifdef USE_RC pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, @@ -971,6 +989,11 @@ void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool) pool->send_bufs = NULL; } if (pool->send_buffers) { +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + ze_driver, pool->send_buffers); +#endif psmi_free(pool->send_buffers); pool->send_buffers = NULL; } @@ -991,6 +1014,11 @@ void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool) } #endif if (pool->recv_buffers) { +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + ze_driver, pool->recv_buffers); +#endif psmi_free(pool->recv_buffers); pool->recv_buffers = NULL; } @@ -1968,6 +1996,9 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) if (psmi_parse_gpudirect()) { // When GPU Direct is enabled we need a GPU Cache loc_info.rdma_mode |= RV_RDMA_MODE_GPU; +#ifdef PSM_ONEAPI + psm3_oneapi_ze_can_use_zemem(); +#endif if ((ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED) && (psmi_parse_gpudirect_rdma_send_limit(1) || psmi_parse_gpudirect_rdma_recv_limit(1))) { diff --git a/psm3/psm.c b/psm3/psm.c index ad0cdf0..40826d3 100644 --- a/psm3/psm.c +++ b/psm3/psm.c @@ -366,6 +366,11 @@ static void psmi_cuda_stats_register() ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, ze_driver_handle_t *phDrivers); ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +#ifndef PSM3_NO_ONEAPI_IMPORT +ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); +ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); +ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); +#endif ze_result_t (*psmi_zeContextCreate)(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); ze_result_t (*psmi_zeContextDestroy)(ze_context_handle_t hContext); ze_result_t (*psmi_zeCommandQueueCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); @@ -406,6 +411,11 @@ ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, zel_component_versio uint64_t psmi_count_zeInit; uint64_t psmi_count_zeDriverGet; uint64_t psmi_count_zeDeviceGet; +#ifndef PSM3_NO_ONEAPI_IMPORT +uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; +uint64_t psmi_count_zexDriverImportExternalPointer; +uint64_t psmi_count_zexDriverReleaseImportedPointer; +#endif uint64_t psmi_count_zeContextCreate; uint64_t psmi_count_zeContextDestroy; uint64_t psmi_count_zeCommandQueueCreate; @@ -463,6 +473,9 @@ int psmi_oneapi_ze_load() PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeInit); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGet); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGet); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); +#endif PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeContextCreate); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeContextDestroy); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueCreate); @@ -522,6 +535,11 @@ static void psmi_oneapi_ze_stats_register() PSMI_ONEAPI_ZE_COUNT_DECLU64(zeInit), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), +#ifndef PSM3_NO_ONEAPI_IMPORT + PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), + PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), + PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverReleaseImportedPointer), +#endif PSMI_ONEAPI_ZE_COUNT_DECLU64(zeContextCreate), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeContextDestroy), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueCreate), @@ -738,39 +756,38 @@ static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, psmi_free(props); } +// create command queue for use in psmi_oneapi_ze_memcpy for sync memcpy static void psmi_oneapi_cmd_create(ze_device_handle_t dev, struct ze_dev_ctxt *ctxt) { ze_command_queue_desc_t ze_cq_desc = { .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, .flags = 0, -#ifdef PSM3_USE_ONEAPI_IMMEDIATE - .mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, -#else - .mode = ZE_COMMAND_QUEUE_MODE_DEFAULT, -#endif + //.mode set below .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, }; -#ifndef PSM3_USE_ONEAPI_IMMEDIATE - ze_command_list_desc_t ze_cl_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .flags = 0 - }; -#endif psmi_oneapi_find_copy_only_engine(dev, ctxt); ze_cq_desc.ordinal = ctxt->ordinal; ze_cq_desc.index = ctxt->index; -#ifdef PSM3_USE_ONEAPI_IMMEDIATE - PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, ze_context, dev, - &ze_cq_desc, &ctxt->cl); -#else - PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, ze_context, dev, - &ze_cq_desc, &ctxt->cq); - ze_cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, ze_context, dev, &ze_cl_desc, - &ctxt->cl); -#endif + if (psm3_oneapi_immed_sync_copy) { + ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, ze_context, + dev, &ze_cq_desc, &ctxt->cl); + } else { + ze_command_list_desc_t ze_cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .flags = 0 + }; + ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT; + + PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, ze_context, + dev, &ze_cq_desc, &ctxt->cq); + + ze_cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; + PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, ze_context, + dev, &ze_cl_desc, &ctxt->cl); + } ctxt->dev = dev; } @@ -787,11 +804,7 @@ void psmi_oneapi_cmd_create_all(void) for (i = 0; i < num_ze_devices; i++) { ctxt = &ze_devices[i]; -#ifdef PSM3_USE_ONEAPI_IMMEDIATE if (!ctxt->cl) -#else - if (!ctxt->cq || !ctxt->cl) -#endif psmi_oneapi_cmd_create(ctxt->dev, ctxt); } if (num_ze_devices > 0) @@ -810,12 +823,10 @@ void psmi_oneapi_cmd_destroy_all(void) PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); ctxt->cl = NULL; } -#ifndef PSM3_USE_ONEAPI_IMMEDIATE if (ctxt->cq) { PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->cq); ctxt->cq = NULL; } -#endif } cur_ze_dev = NULL; @@ -835,6 +846,7 @@ int psmi_oneapi_ze_initialize() zel_component_version_t *zel_comps = NULL; size_t num_zel_comps; int i; + union psmi_envvar_val env; PSM2_LOG_MSG("entering"); _HFI_VDBG("Init Level Zero library.\n"); @@ -844,6 +856,19 @@ int psmi_oneapi_ze_initialize() if (err != PSM2_OK) goto fail; + psm3_getenv("PSM3_ONEAPI_IMMED_SYNC_COPY", + "Use Immediate CommandList for synchronous copy to/from GPU]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_oneapi_immed_sync_copy = env.e_int; + + psm3_getenv("PSM3_ONEAPI_IMMED_ASYNC_COPY", + "Use Immediate CommandList for asynchronous pipeline copy to/from GPU]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_oneapi_immed_async_copy = env.e_int; + + PSMI_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); /* Need to query count before alloc array */ @@ -874,6 +899,10 @@ int psmi_oneapi_ze_initialize() } PSMI_ONEAPI_ZE_CALL(zeDriverGet, &ze_driver_count, &ze_driver); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSMI_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, ze_driver, "zexDriverImportExternalPointer", (void **)&psmi_zexDriverImportExternalPointer); + PSMI_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, ze_driver, "zexDriverReleaseImportedPointer", (void **)&psmi_zexDriverReleaseImportedPointer); +#endif PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &ze_device_count, NULL); if (ze_device_count > MAX_ZE_DEVICES) diff --git a/psm3/psm_oneapi_ze.c b/psm3/psm_oneapi_ze.c index 237aac4..568581a 100644 --- a/psm3/psm_oneapi_ze.c +++ b/psm3/psm_oneapi_ze.c @@ -68,6 +68,8 @@ static int psm3_ze_dev_fds[MAX_ZE_DEVICES]; int psm3_num_ze_dev_fds; #endif +int psm3_oneapi_immed_sync_copy; +int psm3_oneapi_immed_async_copy; const char* psmi_oneapi_ze_result_to_string(const ze_result_t result) { #define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES) @@ -114,6 +116,65 @@ const char* psmi_oneapi_ze_result_to_string(const ze_result_t result) { #undef ZE_RESULT_CASE } +// when allocating bounce buffers either malloc w/Import or +// zeMemAllocHost can be used. zeMemAllocHost tends to perform +// better in the subsequent GPU copy's AppendMemoryCopy. However +// zeMemAllocHost results in a GPU-like address which requires dmabuf +// so we can't use zeMemAllocHost for DMA to/from the bounce buffer +// unless rv is available to handle GPU addresses (eg. PSM3_GPUDIRECT=1) + +void *psm3_oneapi_ze_host_alloc_malloc(unsigned size) +{ + void *ret_ptr = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, ret_ptr, size); +#endif + return ret_ptr; +} + +void psm3_oneapi_ze_host_free_malloc(void *ptr) +{ +#ifndef PSM3_NO_ONEAPI_IMPORT + PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, ptr); +#endif + psmi_free(ptr); +} + +#ifndef PSM3_USE_ONEAPI_MALLOC +void *psm3_oneapi_ze_host_alloc_zemem(unsigned size) +{ + void *ret_ptr; + ze_host_mem_alloc_desc_t host_desc = { + .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, + .flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW + }; + PSMI_ONEAPI_ZE_CALL(zeMemAllocHost, ze_context, + &host_desc, size, 8, &ret_ptr); + return ret_ptr; +} + +void psm3_oneapi_ze_host_free_zemem(void *ptr) +{ + PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, ptr); +} + +void *(*psm3_oneapi_ze_host_alloc)(unsigned size) = psm3_oneapi_ze_host_alloc_malloc; +void (*psm3_oneapi_ze_host_free)(void *ptr) = psm3_oneapi_ze_host_free_malloc; +int psm3_oneapi_ze_using_zemem_alloc = 0; +#endif /* PSM3_USE_ONEAPI_MALLOC */ + +// this is only called if GPU Direct is enabled in rv such that +// GDR Copy and/or RDMA MRs can provide GPU-like addresses to rv +void psm3_oneapi_ze_can_use_zemem() +{ +#ifndef PSM3_USE_ONEAPI_MALLOC + psm3_oneapi_ze_host_alloc = psm3_oneapi_ze_host_alloc_zemem; + psm3_oneapi_ze_host_free = psm3_oneapi_ze_host_free_zemem; + psm3_oneapi_ze_using_zemem_alloc = 1; +#endif +} + +// synchronous GPU memcpy void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size) { struct ze_dev_ctxt *ctxt; @@ -128,13 +189,60 @@ void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size) return; } } - PSM3_ONEAPI_ZE_RESET_COMMAND_LIST(ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, dstptr, srcptr, size, NULL, 0, NULL); - PSM3_ONEAPI_ZE_CLOSE_COMMAND_LIST(ctxt->cl); - PSM3_ONEAPI_ZE_EXECUTE_COMMAND_LIST(ctxt->cq, ctxt->cl, NULL); -#ifndef PSM3_USE_ONEAPI_IMMEDIATE - PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); -#endif + if (psm3_oneapi_immed_sync_copy) { + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + } else { + PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); + PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); + PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, + 1, &ctxt->cl, NULL); + PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); + } +} + +// for pipelined async GPU memcpy +// *p_cq is left as NULL when psm3_oneapi_immed_async_copy enabled +void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, + ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl) +{ + psmi_assert(! *p_cl); + if (psm3_oneapi_immed_async_copy) { + ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL + }; + cq_desc.ordinal = ctxt->ordinal; + cq_desc.index = ctxt->index++; + ctxt->index %= ctxt->num_queues; + PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, + ze_context, ctxt->dev, &cq_desc, p_cl); + } else { + if (! *p_cq) { + ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL + }; + cq_desc.ordinal = ctxt->ordinal; + cq_desc.index = ctxt->index++; + ctxt->index %= ctxt->num_queues; + PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, + ze_context, ctxt->dev, &cq_desc, p_cq); + } + ze_command_list_desc_t cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .flags = 0 + }; + cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; + PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, + ze_context, ctxt->dev, &cl_desc, p_cl); + } } #ifndef PSM_HAVE_PIDFD diff --git a/psm3/psm_sysbuf.c b/psm3/psm_sysbuf.c index 111edea..f9bee0b 100644 --- a/psm3/psm_sysbuf.c +++ b/psm3/psm_sysbuf.c @@ -125,6 +125,10 @@ void psm3_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently no for (i=0; i < MM_NUM_OF_POOLS; i++) { while ((block = mq->handler_index[i].free_list) != NULL) { mq->handler_index[i].free_list = block->next; +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); +#endif psmi_free(block); } } @@ -164,6 +168,13 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + // for transient buffers, no use Importing, adds cost for + // CPU copy, just pay GPU cost on the copy, we use once & free + //if (PSMI_IS_GPU_ENABLED) + // PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, + // new_block, newsz); +#endif new_block->mem_handler = mm_handler; new_block++; mm_handler->total_alloc++; @@ -176,11 +187,19 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) uint32_t newsz = mm_handler->block_size + sizeof(struct psmi_mem_block_ctrl); new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); - mq->mem_ctrl_total_bytes += newsz; if (new_block) { +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + // By registering memory with Level Zero, we make + // zeCommandListAppendMemoryCopy run faster for copies between + // GPU and this sysbuf + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, + new_block, newsz); +#endif mm_handler->current_available++; mm_handler->total_alloc++; + mq->mem_ctrl_total_bytes += newsz; new_block->next = mm_handler->free_list; mm_handler->free_list = new_block; @@ -214,6 +233,12 @@ void psm3_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) mm_handler = block_to_free->mem_handler; if (mm_handler->flags & MM_FLAG_TRANSIENT) { +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + // for transient buffers, no use Importing, adds cost for + // CPU copy, just pay GPU cost on the copy, we use once & free + //if (PSMI_IS_GPU_ENABLED) + // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); +#endif psmi_free(block_to_free); } else { block_to_free->next = mm_handler->free_list; diff --git a/psm3/psm_user.h b/psm3/psm_user.h index 932fed6..38e9b8d 100644 --- a/psm3/psm_user.h +++ b/psm3/psm_user.h @@ -61,13 +61,14 @@ extern "C" { #endif #if defined(PSM_ONEAPI) -// if defined, will use immediate command lists (just Append) -// if not defined, use normal command lists (with Reset, Append, Close, Execute) -//#define PSM3_USE_ONEAPI_IMMEDIATE - // if defined, use malloc for pipeline copy bounce buffers // otherwise, use zeMemAllocHost //#define PSM3_USE_ONEAPI_MALLOC + +// if defined, do not use zexDriverImportExternalPointer for malloced pipeline +// copy bounce buffers +// otherwise, use zexDriverImportExternalPointer when malloc buffer +//#define PSM3_NO_ONEAPI_IMPORT #endif /* Instead of testing a HAL cap mask bit at runtime (in addition to thresholds), @@ -430,9 +431,7 @@ struct ze_dev_ctxt { uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ uint32_t index; /* Cmdqueue index within the CmdQGrp */ uint32_t num_queues; /* Number of queues in the CmdQGrp */ -#ifndef PSM3_USE_ONEAPI_IMMEDIATE - ze_command_queue_handle_t cq; -#endif + ze_command_queue_handle_t cq; // NULL if psm3_oneapi_immed_sync_copy ze_command_list_handle_t cl; }; @@ -443,8 +442,12 @@ extern ze_driver_handle_t ze_driver; extern struct ze_dev_ctxt ze_devices[MAX_ZE_DEVICES]; extern int num_ze_devices; extern struct ze_dev_ctxt *cur_ze_dev; +extern int psm3_oneapi_immed_sync_copy; +extern int psm3_oneapi_immed_async_copy; const char* psmi_oneapi_ze_result_to_string(const ze_result_t result); +void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, + ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl); #ifndef PSM_HAVE_PIDFD psm2_error_t psm3_sock_detach(ptl_t *ptl_gen); psm2_error_t psm3_ze_init_ipc_socket(ptl_t *ptl_gen); @@ -453,6 +456,16 @@ psm2_error_t psm3_check_dev_fds_exchanged(ptl_t *ptl_gen, psm2_epaddr_t epaddr); psm2_error_t psm3_poll_dev_fds_exchange(ptl_t *ptl_gen); #endif +#ifdef PSM3_USE_ONEAPI_MALLOC +void *psm3_oneapi_ze_host_alloc_malloc(unsigned size); +void psm3_oneapi_ze_host_free_malloc(void *ptr); +#else +extern void *(*psm3_oneapi_ze_host_alloc)(unsigned size); +extern void (*psm3_oneapi_ze_host_free)(void *ptr); +extern int psm3_oneapi_ze_using_zemem_alloc; +#endif +extern void psm3_oneapi_ze_can_use_zemem(); + void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size); static inline @@ -509,7 +522,14 @@ extern cudaError_t (*psmi_cudaRuntimeGetVersion)(int* runtimeVersion); #ifdef PSM_ONEAPI extern ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); extern ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, ze_driver_handle_t *phDrivers); +#ifndef PSM3_NO_ONEAPI_IMPORT +extern ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); +extern ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); +#endif extern ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +#ifndef PSM3_NO_ONEAPI_IMPORT +extern ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); +#endif extern ze_result_t (*psmi_zeContextCreate)(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); extern ze_result_t (*psmi_zeContextDestroy)(ze_context_handle_t hContext); extern ze_result_t (*psmi_zeCommandQueueCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); @@ -592,7 +612,14 @@ extern uint64_t psmi_count_cudaRuntimeGetVersion; #ifdef PSM_ONEAPI extern uint64_t psmi_count_zeInit; extern uint64_t psmi_count_zeDriverGet; +#ifndef PSM3_NO_ONEAPI_IMPORT +extern uint64_t psmi_count_zexDriverImportExternalPointer; +extern uint64_t psmi_count_zexDriverReleaseImportedPointer; +#endif extern uint64_t psmi_count_zeDeviceGet; +#ifndef PSM3_NO_ONEAPI_IMPORT +extern uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; +#endif extern uint64_t psmi_count_zeContextCreate; extern uint64_t psmi_count_zeContextDestroy; extern uint64_t psmi_count_zeCommandQueueCreate; @@ -1266,92 +1293,6 @@ _psmi_is_gdr_copy_enabled()) #define PSMI_IS_GPU_MEM(x) PSMI_IS_CUDA_MEM(x) #elif defined(PSM_ONEAPI) -#ifdef PSM3_USE_ONEAPI_IMMEDIATE -// only called by FORCE_INIT, we can do nothing and let the subsequent -// HTOD_START or DTOH_START create the command list -#define PSM3_ONEAPI_ZE_CREATE_COMMAND_LIST(hContext, hDevice, phCommandList) \ - do { } while (0) -#define PSM3_ONEAPI_ZE_CREATE_COMMAND_QUEUE_AND_LIST(p, ghb, len, cq) \ - do { \ - if (ghb->command_list == NULL) { \ - ze_command_queue_desc_t cq_desc = { \ - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,\ - .flags = 0, \ - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, \ - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL \ - }; \ - cq_desc.ordinal = ctxt->ordinal; \ - cq_desc.index = ctxt->index++; \ - ctxt->index %= ctxt->num_queues; \ - (void)p; /* keep compiler happy that p is used */ \ - PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, \ - ze_context, ctxt->dev, &cq_desc, \ - &ghb->command_list); \ - } \ - } while (0) - -#define PSM3_ONEAPI_ZE_CLOSE_COMMAND_LIST(hCommandList) \ - do { } while (0) -#define PSM3_ONEAPI_ZE_EXECUTE_COMMAND_LIST(hCommandQueue, hCommandList, hFence) \ - do { } while (0) -#define PSM3_ONEAPI_ZE_RESET_COMMAND_LIST(hCommandList) \ - do { } while (0) - -#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ - do { } while (0) -#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ - do { } while (0) -#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ - do { } while (0) -#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ - do { } while (0) - -#else /* PSM3_USE_ONEAPI_IMMEDIATE */ - -#define PSM3_ONEAPI_ZE_CREATE_COMMAND_LIST(hContext, hDevice, phCommandList) \ - do { \ - ze_command_list_desc_t cl_desc = { \ - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, \ - .flags = 0 \ - }; \ - PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, \ - hContext, hDevice, &cl_desc, phCommandList); \ - } while (0) - -#define PSM3_ONEAPI_ZE_CREATE_COMMAND_QUEUE_AND_LIST(p, ghb, len, cq) \ - do { \ - if (cq == NULL) { \ - ze_command_queue_desc_t cq_desc = { \ - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,\ - .flags = 0, \ - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, \ - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL \ - }; \ - cq_desc.ordinal = ctxt->ordinal; \ - cq_desc.index = ctxt->index++; \ - ctxt->index %= ctxt->num_queues; \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, \ - ze_context, ctxt->dev, &cq_desc, &cq);\ - } \ - if (ghb->command_list == NULL) { \ - ze_command_list_desc_t cl_desc = { \ - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, \ - .flags = 0 \ - }; \ - cl_desc.commandQueueGroupOrdinal = \ - ctxt->ordinal; \ - PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, \ - ze_context, ctxt->dev, &cl_desc, \ - &ghb->command_list); \ - } \ - } while (0) -#define PSM3_ONEAPI_ZE_CLOSE_COMMAND_LIST(hCommandList) \ - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, hCommandList); -#define PSM3_ONEAPI_ZE_EXECUTE_COMMAND_LIST(hCommandQueue, hCommandList, hFence) \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, \ - hCommandQueue, 1, &hCommandList, hFence) -#define PSM3_ONEAPI_ZE_RESET_COMMAND_LIST(hCommandList) \ - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, hCommandList) #define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ do { \ protoexp->cq_recv = NULL; \ @@ -1374,7 +1315,6 @@ _psmi_is_gdr_copy_enabled()) proto->cq_send); \ } \ } while (0) -#endif /* PSM3_USE_ONEAPI_IMMEDIATE */ #define PSM3_GPU_MEMCPY_HTOD_START(protoexp, ghb, len) \ do { \ @@ -1407,16 +1347,21 @@ _psmi_is_gdr_copy_enabled()) ghb->event_pool, &event_desc, \ &ghb->copy_status); \ } \ - PSM3_ONEAPI_ZE_CREATE_COMMAND_QUEUE_AND_LIST( \ - protoexp, ghb, len, \ - protoexp->cq_recv); \ + if (! ghb->command_list) { \ + psmi_oneapi_async_cmd_create(ctxt, \ + &protoexp->cq_recv, &ghb->command_list);\ + } \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ ghb->command_list, \ ghb->gpu_buf, ghb->host_buf, len, \ ghb->copy_status, 0, NULL); \ - PSM3_ONEAPI_ZE_CLOSE_COMMAND_LIST(ghb->command_list); \ - PSM3_ONEAPI_ZE_EXECUTE_COMMAND_LIST( \ - protoexp->cq_recv, ghb->command_list, NULL); \ + if (! psm3_oneapi_immed_async_copy) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ + ghb->command_list); \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ + protoexp->cq_recv, 1, \ + &ghb->command_list, NULL); \ + } \ } while (0) #define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len, bufsz) \ do { \ @@ -1452,15 +1397,21 @@ _psmi_is_gdr_copy_enabled()) if (ghb->host_buf == NULL && bufsz) { \ PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ } \ - PSM3_ONEAPI_ZE_CREATE_COMMAND_QUEUE_AND_LIST( \ - proto, ghb, len, proto->cq_send); \ + if (! ghb->command_list) { \ + psmi_oneapi_async_cmd_create(ctxt, \ + &proto->cq_send, &ghb->command_list);\ + } \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ ghb->command_list, \ ghb->host_buf, ghb->gpu_buf, len, \ ghb->copy_status, 0, NULL); \ - PSM3_ONEAPI_ZE_CLOSE_COMMAND_LIST(ghb->command_list); \ - PSM3_ONEAPI_ZE_EXECUTE_COMMAND_LIST( \ - proto->cq_send, ghb->command_list, NULL); \ + if (! psm3_oneapi_immed_async_copy) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ + ghb->command_list); \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ + proto->cq_send, 1, \ + &ghb->command_list, NULL); \ + } \ } while (0) #define PSM3_GPU_MEMCPY_DONE(ghb) \ _psm3_oneapi_ze_memcpy_done(ghb) @@ -1486,27 +1437,20 @@ _psmi_is_gdr_copy_enabled()) .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ .index = 0 \ }; \ - struct ze_dev_ctxt *ctxt; \ - \ - ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ - if (!ctxt) \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "%s F_INIT: no dev ctxt\n", \ - __FUNCTION__); \ PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ ze_context, &pool_desc, 0, NULL, \ &ghb->event_pool); \ PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ ghb->event_pool, &event_desc, \ &ghb->copy_status); \ - PSM3_ONEAPI_ZE_CREATE_COMMAND_LIST( \ - ze_context, ctxt->dev, &ghb->command_list); \ PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ } while (0) #define PSM3_GPU_HOSTBUF_RESET(ghb) \ do { \ - PSM3_ONEAPI_ZE_RESET_COMMAND_LIST(ghb->command_list); \ + if (! psm3_oneapi_immed_async_copy) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandListReset, \ + ghb->command_list); \ + } \ PSMI_ONEAPI_ZE_CALL(zeEventHostReset, \ ghb->copy_status); \ } while (0) @@ -1537,33 +1481,30 @@ _psmi_is_gdr_copy_enabled()) #ifdef PSM3_USE_ONEAPI_MALLOC #define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ do { \ - *ret_ptr = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size);\ + *ret_ptr = psm3_oneapi_ze_host_alloc_malloc(size); \ } while (0) #define PSM3_ONEAPI_ZE_HOST_FREE(ptr) \ - psmi_free(ptr) + psm3_oneapi_ze_host_free_malloc(ptr) // HOST_ALLOC memory treated as CPU memory for Verbs MRs #define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used )) + ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) #define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ ( (tidrecvc)->is_ptr_gpu_backed ) #else /* PSM3_USE_ONEAPI_MALLOC */ #define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ do { \ - ze_host_mem_alloc_desc_t host_desc = { \ - .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, \ - .flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW \ - }; \ - PSMI_ONEAPI_ZE_CALL(zeMemAllocHost, ze_context, \ - &host_desc, size, 8, (void **)(ret_ptr)); \ + *ret_ptr = (*psm3_oneapi_ze_host_alloc)(size); \ } while (0) #define PSM3_ONEAPI_ZE_HOST_FREE(ptr) \ - PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, ptr) + (*psm3_oneapi_ze_host_free)(ptr) // HOST_ALLOC memory treated as GPU memory for Verbs MRs -// Note: no need to "|| gpu_hostbuf_used" since only set if is_buf_gpu_mem +// Note: gpu_hostbuf_used" only set if is_buf_gpu_mem #define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem ) + ( (mqreq)->is_buf_gpu_mem && \ + (! (mqreq)->gpu_hostbuf_used || psm3_oneapi_ze_using_zemem_alloc )) #define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ - ( (tidrecvc)->is_ptr_gpu_backed || (mqreq)->gpu_hostbuf_used ) + ( (tidrecvc)->is_ptr_gpu_backed \ + || ((mqreq)->gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc)) #endif /* PSM3_USE_ONEAPI_MALLOC */ #define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { /* not needed for OneAPI ZE */ } while (0) #define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ diff --git a/psm3/ptl_ips/ips_expected_proto.h b/psm3/ptl_ips/ips_expected_proto.h index fb3da99..6e9b94f 100644 --- a/psm3/ptl_ips/ips_expected_proto.h +++ b/psm3/ptl_ips/ips_expected_proto.h @@ -136,8 +136,8 @@ struct ips_protoexp { #endif #ifdef PSM_CUDA CUstream cudastream_recv; -#elif defined(PSM_ONEAPI) && ! defined(PSM3_USE_ONEAPI_IMMEDIATE) - ze_command_queue_handle_t cq_recv; +#elif defined(PSM_ONEAPI) + ze_command_queue_handle_t cq_recv; // NULL if psm3_oneapi_immed_async_copy #endif }; diff --git a/psm3/ptl_ips/ips_proto.h b/psm3/ptl_ips/ips_proto.h index 05737cc..eccd6ce 100644 --- a/psm3/ptl_ips/ips_proto.h +++ b/psm3/ptl_ips/ips_proto.h @@ -436,8 +436,8 @@ struct ips_proto { #ifdef PSM_CUDA CUstream cudastream_send; -#elif defined(PSM_ONEAPI) && ! defined(PSM3_USE_ONEAPI_IMMEDIATE) - ze_command_queue_handle_t cq_send; +#elif defined(PSM_ONEAPI) + ze_command_queue_handle_t cq_send; // NULL if psm3_oneapi_immed_async_copy #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI)