Skip to content

Commit

Permalink
hmem/cuda: Add dmabuf fd ops functions
Browse files Browse the repository at this point in the history
Implement the get_dmabuf_fd API for cuda interface.

Signed-off-by: Shi Jin <[email protected]>
  • Loading branch information
shijin-aws committed Nov 2, 2023
1 parent 3c9ec83 commit c1b2f4a
Show file tree
Hide file tree
Showing 4 changed files with 181 additions and 2 deletions.
19 changes: 19 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ AC_ARG_WITH([cuda],

have_cuda=0
cuda_dlopen=0
have_cuda_dmabuf=0
AC_ARG_ENABLE([cuda-dlopen],
[AS_HELP_STRING([--enable-cuda-dlopen],
[Enable dlopen of CUDA libraries @<:@default=no@:>@])
Expand Down Expand Up @@ -657,6 +658,22 @@ AS_IF([test x"$with_cuda" != x"no"],
[],
[have_cuda=1])
])
have_cuda_dmabuf=1
AC_CHECK_DECL([cuMemGetHandleForAddressRange],
[],
[have_cuda_dmabuf=0],
[[#include <cuda.h>]])
AC_CHECK_DECL([CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED],
[],
[have_cuda_dmabuf=0],
[[#include <cuda.h>]])
AC_CHECK_DECL([CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD],
[],
[have_cuda_dmabuf=0],
[[#include <cuda.h>]])
])

AC_DEFINE_UNQUOTED([ENABLE_CUDA_DLOPEN], [$cuda_dlopen], [dlopen CUDA libraries])
Expand All @@ -666,6 +683,8 @@ AS_IF([test x"$with_cuda" != x"no" && test -n "$with_cuda" && test "$have_cuda"

AC_DEFINE_UNQUOTED([HAVE_CUDA], [$have_cuda], [CUDA support])

AC_DEFINE_UNQUOTED([HAVE_CUDA_DMABUF], [$have_cuda_dmabuf], [CUDA dmabuf support])

AS_IF([test "$cuda_dlopen" != "1"], [LIBS="$LIBS $cuda_LIBS"])
AS_IF([test "$have_cuda" = "1" && test x"$with_cuda" != x"yes"],
[CPPFLAGS="$CPPFLAGS $cuda_CPPFLAGS"
Expand Down
3 changes: 3 additions & 0 deletions include/ofi_hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,9 @@ int cuda_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src,
bool cuda_is_ipc_enabled(void);
int cuda_get_ipc_handle_size(size_t *size);
bool cuda_is_gdrcopy_enabled(void);
bool cuda_is_dmabuf_supported(void);
int cuda_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
uint64_t *offset);

void cuda_gdrcopy_to_dev(uint64_t handle, void *dev,
const void *host, size_t size);
Expand Down
2 changes: 1 addition & 1 deletion src/hmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_unregister = cuda_dev_unregister,
.dev_reg_copy_to_hmem = cuda_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = cuda_dev_reg_copy_from_hmem,
.get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd,
.get_dmabuf_fd = cuda_get_dmabuf_fd,
},
[FI_HMEM_ROCR] = {
.initialized = false,
Expand Down
159 changes: 158 additions & 1 deletion src/hmem_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

#include "ofi_hmem.h"
#include "ofi.h"
#include "ofi_mem.h"

#if HAVE_CUDA

Expand All @@ -52,13 +53,27 @@
* Convenience higher-order macros for enumerating CUDA driver/runtime and
* NVML API function names
*/

/*
 * Driver entry points that are only needed for dmabuf export. The list is
 * empty when HAVE_CUDA_DMABUF is not set, so the composed list below
 * collapses to the common entries.
 */
#if HAVE_CUDA_DMABUF
#define CUDA_DRIVER_DMABUF_FUNCS_DEF(_)		\
	_(cuMemGetHandleForAddressRange)	\
	_(cuDeviceGetAttribute)			\
	_(cuDeviceGet)
#else
#define CUDA_DRIVER_DMABUF_FUNCS_DEF(_)
#endif /* HAVE_CUDA_DMABUF */

/*
 * Full list of CUDA driver entry points: the common entries first, followed
 * by the dmabuf-specific ones (if any), in the same order as before.
 */
#define CUDA_DRIVER_FUNCS_DEF(_)	\
	_(cuGetErrorName)		\
	_(cuGetErrorString)		\
	_(cuPointerGetAttribute)	\
	_(cuPointerSetAttribute)	\
	_(cuDeviceCanAccessPeer)	\
	_(cuMemGetAddressRange)		\
	CUDA_DRIVER_DMABUF_FUNCS_DEF(_)

#define CUDA_RUNTIME_FUNCS_DEF(_) \
_(cudaMemcpy) \
Expand Down Expand Up @@ -86,6 +101,7 @@ static struct {
bool p2p_access_supported;
bool use_gdrcopy;
bool use_ipc;
bool dmabuf_supported;
void *driver_handle;
void *runtime_handle;
void *nvml_handle;
Expand All @@ -96,7 +112,8 @@ static struct {
.use_ipc = false,
.driver_handle = NULL,
.runtime_handle = NULL,
.nvml_handle = NULL
.nvml_handle = NULL,
.dmabuf_supported = false
};

static struct {
Expand All @@ -119,6 +136,15 @@ static struct {
size_t* psize, CUdeviceptr dptr);
CUresult (*cuDeviceCanAccessPeer)(int *canAccessPeer,
CUdevice srcDevice, CUdevice dstDevice);
#if HAVE_CUDA_DMABUF
CUresult (*cuMemGetHandleForAddressRange)(void* handle,
CUdeviceptr dptr, size_t size,
CUmemRangeHandleType handleType,
unsigned long long flags);
CUresult (*cuDeviceGetAttribute)(int* pi,
CUdevice_attribute attrib, CUdevice dev);
CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
#endif /* HAVE_CUDA_DMABUF */
cudaError_t (*cudaHostRegister)(void *ptr, size_t size,
unsigned int flags);
cudaError_t (*cudaHostUnregister)(void *ptr);
Expand Down Expand Up @@ -199,6 +225,18 @@ CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute,
return cuda_ops.cuPointerGetAttribute(data, attribute, ptr);
}

/*
 * Log a warning for a failed CUDA driver API call.
 *
 * cu_result      the CUresult returned by the failed call
 * cuda_api_name  C string naming the API that failed (printed with %s)
 *
 * The strings handed back by cuGetErrorName()/cuGetErrorString() belong to
 * the CUDA driver; they are not heap allocations and must never be passed to
 * free(). Both pointers are pre-set to NULL and replaced with a placeholder
 * when the driver does not recognize the error code, so that a NULL pointer
 * is never given to the %s conversion.
 */
#define CUDA_DRIVER_LOG_ERR(cu_result, cuda_api_name)			\
	{								\
		const char *cu_error_name = NULL;			\
		const char *cu_error_str = NULL;			\
		cuda_ops.cuGetErrorName(cu_result, &cu_error_name);	\
		cuda_ops.cuGetErrorString(cu_result, &cu_error_str);	\
		FI_WARN(&core_prov, FI_LOG_CORE, "%s failed: %s:%s\n",	\
			cuda_api_name,					\
			cu_error_name ? cu_error_name : "unknown",	\
			cu_error_str ? cu_error_str : "unknown");	\
	}

/**
* @brief Set CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for a cuda ptr
* to ensure any synchronous copies are completed prior
Expand Down Expand Up @@ -613,6 +651,105 @@ static int cuda_hmem_detect_p2p_access_support(void)
return FI_SUCCESS;
}

/**
 * @brief Probe the platform for CUDA dmabuf support
 *
 * When built with HAVE_CUDA_DMABUF, asks the driver for the
 * CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED attribute of CUDA device 0 and
 * caches the answer in cuda_attr.dmabuf_supported. Without
 * HAVE_CUDA_DMABUF nothing is queried and the cached value is untouched.
 *
 * @return	FI_SUCCESS if the probe completed (supported or not)
 *		-FI_EIO upon CUDA API error
 */
static int cuda_hmem_detect_dmabuf_support(void)
{
#if HAVE_CUDA_DMABUF
	CUdevice device0;
	CUresult rc;
	int attr_value = 0;

	/* Only the first device (ordinal 0) is inspected. */
	rc = cuda_ops.cuDeviceGet(&device0, 0);
	if (rc == CUDA_SUCCESS) {
		rc = cuda_ops.cuDeviceGetAttribute(
				&attr_value,
				CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED,
				device0);
		if (rc != CUDA_SUCCESS) {
			CUDA_DRIVER_LOG_ERR(rc, "cuDeviceGetAttribute");
			return -FI_EIO;
		}
	} else {
		CUDA_DRIVER_LOG_ERR(rc, "cuDeviceGet");
		return -FI_EIO;
	}

	FI_INFO(&core_prov, FI_LOG_CORE,
		"cuda dmabuf support status: %d\n", attr_value);

	/* Only an attribute value of exactly 1 counts as "supported". */
	cuda_attr.dmabuf_supported = (attr_value == 1);
#endif /* HAVE_CUDA_DMABUF */

	return FI_SUCCESS;
}

/**
 * @brief Get dmabuf fd and offset for a given cuda memory allocation
 *
 * The whole CUDA allocation containing addr is looked up with
 * cuda_get_base_addr(), widened to host-page boundaries, and exported as a
 * single dmabuf. The returned offset locates addr inside that exported
 * region.
 *
 * @param addr		the starting address of the cuda memory allocation
 * @param size		the length of the cuda memory allocation
 * @param fd		the fd of the dmabuf region
 * @param offset	the offset of the buf in the dmabuf region
 * @return	FI_SUCCESS if dmabuf fd and offset are retrieved successfully
 *		-FI_EOPNOTSUPP if dmabuf is not supported on the cuda device
 *		-FI_EIO upon CUDA API error
 *		any error returned by cuda_get_base_addr()
 */
int cuda_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
		       uint64_t *offset)
{
#if HAVE_CUDA_DMABUF
	CUdeviceptr aligned_ptr;
	CUresult cuda_ret;
	int ret;

	size_t aligned_size;
	size_t host_page_size = ofi_get_page_size();
	void *base_addr;
	size_t total_size;
	uintptr_t last_byte;

	if (!cuda_is_dmabuf_supported())
		return -FI_EOPNOTSUPP;

	ret = cuda_get_base_addr(addr, size, &base_addr, &total_size);
	if (ret)
		return ret;

	aligned_ptr = (uintptr_t) ofi_get_page_start(base_addr, host_page_size);

	/*
	 * Round up from the LAST byte of the allocation, not from the
	 * one-past-the-end address: when the allocation ends exactly on a
	 * page boundary, base_addr + total_size already lies in the next
	 * page and the range would grow by a full extra page that is not
	 * part of the allocation.
	 * NOTE(review): relies on ofi_get_page_end() returning the address
	 * of the last byte of the page containing its argument (which the
	 * "+ 1" below already presumes) and on cuda_get_base_addr()
	 * reporting a non-zero total_size on success - confirm.
	 */
	last_byte = (uintptr_t) base_addr + total_size - 1;
	aligned_size = (uintptr_t) ofi_get_page_end((void *) last_byte,
						    host_page_size)
			- (uintptr_t) aligned_ptr + 1;

	cuda_ret = cuda_ops.cuMemGetHandleForAddressRange(
				(void *)fd,
				aligned_ptr, aligned_size,
				CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
				0);
	if (cuda_ret != CUDA_SUCCESS) {
		CUDA_DRIVER_LOG_ERR(cuda_ret, "cuMemGetHandleForAddressRange");
		return -FI_EIO;
	}

	/* Offset of the caller's buffer from the start of the exported range. */
	*offset = (uintptr_t) addr - (uintptr_t) aligned_ptr;

	/*
	 * uint64_t and size_t are not guaranteed to be unsigned long, so
	 * use %llu with an explicit cast and %zu respectively.
	 */
	FI_INFO(&core_prov, FI_LOG_CORE,
		"Get dma buf handle with fd: %d, offset: %llu"
		", page aligned base address: %p"
		", page aligned size: %zu, cuda allocation address %p"
		", cuda allocation length: %llu\n",
		*fd, (unsigned long long) *offset,
		(void *) aligned_ptr, aligned_size,
		(void *) addr, (unsigned long long) size);

	return FI_SUCCESS;
#else
	return -FI_EOPNOTSUPP;
#endif /* HAVE_CUDA_DMABUF */
}

int cuda_hmem_init(void)
{
int ret;
Expand All @@ -635,6 +772,10 @@ int cuda_hmem_init(void)
if (ret != FI_SUCCESS)
goto dl_cleanup;

ret = cuda_hmem_detect_dmabuf_support();
if (ret != FI_SUCCESS)
goto dl_cleanup;

ret = 1;
fi_param_get_bool(NULL, "hmem_cuda_use_gdrcopy",
&ret);
Expand Down Expand Up @@ -783,6 +924,11 @@ bool cuda_is_gdrcopy_enabled(void)
return cuda_attr.use_gdrcopy;
}

/**
 * @brief Report the cached CUDA dmabuf support flag
 *
 * @return	the current value of cuda_attr.dmabuf_supported
 */
bool cuda_is_dmabuf_supported(void)
{
	const bool supported = cuda_attr.dmabuf_supported;

	return supported;
}

#else

int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
Expand Down Expand Up @@ -878,6 +1024,17 @@ bool cuda_is_gdrcopy_enabled(void)
return false;
}

/**
 * @brief Stub variant of the dmabuf support query
 *
 * NOTE(review): appears to sit in the branch compiled when HAVE_CUDA is
 * not set - confirm against the full file.
 *
 * @return	always false
 */
bool cuda_is_dmabuf_supported(void)
{
	static const bool dmabuf_available = false;

	return dmabuf_available;
}

/**
 * @brief Stub variant of the dmabuf fd lookup
 *
 * None of the arguments are read or written.
 * NOTE(review): appears to sit in the branch compiled when HAVE_CUDA is
 * not set - confirm against the full file.
 *
 * @param addr		unused
 * @param size		unused
 * @param fd		unused, left untouched
 * @param offset	unused, left untouched
 * @return	always -FI_ENOSYS
 */
int cuda_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
		       uint64_t *offset)
{
	const int not_implemented = -FI_ENOSYS;

	return not_implemented;
}

int cuda_set_sync_memops(void *ptr)
{
return FI_SUCCESS;
Expand Down

0 comments on commit c1b2f4a

Please sign in to comment.