hmem/cuda: Add dmabuf fd ops functions

Implement the get_dmabuf_fd API for the CUDA interface.

Signed-off-by: Shi Jin <[email protected]>
shijin-aws · Oct 30, 2023 · 8e1e20c · 8e1e20c
1 parent 95874ad
commit 8e1e20c
Show file tree

Hide file tree

Showing 3 changed files with 135 additions and 3 deletions.
diff --git a/include/ofi_hmem.h b/include/ofi_hmem.h
@@ -191,6 +191,9 @@ int cuda_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src,
 bool cuda_is_ipc_enabled(void);
 int cuda_get_ipc_handle_size(size_t *size);
 bool cuda_is_gdrcopy_enabled(void);
+/* Returns the dmabuf support status detected for the cuda interface. */
+bool cuda_is_dmabuf_supported(void);
+/*
+ * Retrieves a dmabuf fd and the offset of addr within the dmabuf region
+ * for the cuda buffer [addr, addr + size). Returns FI_SUCCESS or a
+ * negative fi_errno value.
+ */
+int cuda_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
+			uint64_t *offset);
 
 void cuda_gdrcopy_to_dev(uint64_t handle, void *dev,
 			 const void *host, size_t size);

diff --git a/src/hmem.c b/src/hmem.c
@@ -166,7 +166,7 @@ struct ofi_hmem_ops hmem_ops[] = {
 		.dev_unregister = cuda_dev_unregister,
 		.dev_reg_copy_to_hmem = cuda_dev_reg_copy_to_hmem,
 		.dev_reg_copy_from_hmem = cuda_dev_reg_copy_from_hmem,
-		.get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd,
+		.get_dmabuf_fd = cuda_get_dmabuf_fd,
 	},
 	[FI_HMEM_ROCR] = {
 		.initialized = false,

diff --git a/src/hmem_cuda.c b/src/hmem_cuda.c
@@ -37,6 +37,7 @@
 
 #include "ofi_hmem.h"
 #include "ofi.h"
+#include "ofi_mem.h"
 
 #if HAVE_CUDA
 
@@ -58,7 +59,10 @@
 	_(cuPointerGetAttribute)	\
 	_(cuPointerSetAttribute)	\
 	_(cuDeviceCanAccessPeer)	\
-	_(cuMemGetAddressRange)
+	_(cuMemGetAddressRange)		\
+	_(cuMemGetHandleForAddressRange) \
+	_(cuDeviceGetAttribute)		\
+	_(cuDeviceGet)
 
 #define CUDA_RUNTIME_FUNCS_DEF(_)	\
 	_(cudaMemcpy)			\
@@ -86,6 +90,7 @@ static struct {
 	bool  p2p_access_supported;
 	bool  use_gdrcopy;
 	bool  use_ipc;
+	bool  dmabuf_supported;
 	void *driver_handle;
 	void *runtime_handle;
 	void *nvml_handle;
@@ -96,7 +101,8 @@ static struct {
 	.use_ipc              = false,
 	.driver_handle        = NULL,
 	.runtime_handle       = NULL,
-	.nvml_handle          = NULL
+	.nvml_handle          = NULL,
+	.dmabuf_supported     = false
 };
 
 static struct {
@@ -119,6 +125,13 @@ static struct {
 					  size_t* psize, CUdeviceptr dptr);
 	CUresult (*cuDeviceCanAccessPeer)(int *canAccessPeer,
 					  CUdevice srcDevice, CUdevice dstDevice);
+	CUresult (*cuMemGetHandleForAddressRange)(void* handle,
+						  CUdeviceptr dptr, size_t size,
+						  CUmemRangeHandleType handleType,
+						  unsigned long long flags);
+	CUresult (*cuDeviceGetAttribute)(int* pi,
+					 CUdevice_attribute attrib, CUdevice dev);
+	CUresult (*cuDeviceGet)(CUdevice* device, int ordinal);
 	cudaError_t (*cudaHostRegister)(void *ptr, size_t size,
 					unsigned int flags);
 	cudaError_t (*cudaHostUnregister)(void *ptr);
@@ -199,6 +212,16 @@ CUresult ofi_cuPointerGetAttribute(void *data, CUpointer_attribute attribute,
 	return cuda_ops.cuPointerGetAttribute(data, attribute, ptr);
 }
 
+/*
+ * Log a failed CUDA driver API call through FI_WARN.
+ *
+ * cu_result     - the CUresult returned by the failing call
+ * cuda_api_name - C string naming the driver API that failed
+ *
+ * The macro is wrapped in do { } while (0) so that it expands to a single
+ * statement and can be used safely in un-braced if/else bodies. The name
+ * and description pointers start out as NULL and are replaced by
+ * "unknown" when the driver does not provide a string, so that a NULL
+ * pointer is never handed to the %s conversions.
+ */
+#define CUDA_DRIVER_LOG_ERR(cu_result, cuda_api_name)			\
+do {									\
+	const char *cu_error_name = NULL;				\
+	const char *cu_error_str = NULL;				\
+	cuda_ops.cuGetErrorName((cu_result), &cu_error_name);		\
+	cuda_ops.cuGetErrorString((cu_result), &cu_error_str);		\
+	FI_WARN(&core_prov, FI_LOG_CORE, "%s failed: %s:%s\n",		\
+		(cuda_api_name),					\
+		cu_error_name ? cu_error_name : "unknown",		\
+		cu_error_str ? cu_error_str : "unknown");		\
+} while (0)
+
 /**
  * @brief Set CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for a cuda ptr
  * to ensure any synchronous copies are completed prior
@@ -613,6 +636,92 @@ static int cuda_hmem_detect_p2p_access_support(void)
 	return FI_SUCCESS;
 }
 
+/**
+ * @brief detect dmabuf support in the current platform
+ * This checks the dmabuf support in the current platform
+ * by querying the CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED attribute
+ * of cuda device 0, and records the result in
+ * cuda_attr.dmabuf_supported. When no cuda device is present
+ * the query is skipped and cuda_attr.dmabuf_supported is left
+ * untouched.
+ *
+ * @return  FI_SUCCESS if dmabuf support check is successful
+ *         -FI_EIO upon CUDA API error
+ */
+static int cuda_hmem_detect_dmabuf_support(void)
+{
+	CUresult cuda_ret;
+	CUdevice dev;
+	int is_supported = 0;
+
+	/*
+	 * Only device 0 is queried below, so one device is sufficient.
+	 * Skip the query only when there is no device at all; requiring
+	 * two or more devices would leave dmabuf permanently disabled
+	 * on single-GPU hosts.
+	 */
+	if (cuda_attr.device_count < 1)
+		return FI_SUCCESS;
+
+	cuda_ret = cuda_ops.cuDeviceGet(&dev, 0);
+	if (cuda_ret != CUDA_SUCCESS) {
+		CUDA_DRIVER_LOG_ERR(cuda_ret, "cuDeviceGet");
+		return -FI_EIO;
+	}
+
+	cuda_ret = cuda_ops.cuDeviceGetAttribute(&is_supported,
+				CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
+	if (cuda_ret != CUDA_SUCCESS) {
+		CUDA_DRIVER_LOG_ERR(cuda_ret, "cuDeviceGetAttribute");
+		return -FI_EIO;
+	}
+
+	FI_INFO(&core_prov, FI_LOG_CORE,
+		"cuda dmabuf support status: %d\n", is_supported);
+	/* the attribute is reported as an int; only 1 means supported */
+	cuda_attr.dmabuf_supported = (is_supported == 1);
+	return FI_SUCCESS;
+}
+
+/**
+ * @brief Get dmabuf fd and offset for a given cuda memory allocation
+ *
+ * The range [addr, addr + size) is widened to host page boundaries
+ * before it is handed to cuMemGetHandleForAddressRange(). The offset
+ * reported back is the distance from the page aligned base address
+ * to addr.
+ *
+ * @param addr the starting address of the cuda memory allocation
+ * @param size the length of the cuda memory allocation
+ * @param fd the fd of the dmabuf region
+ * @param offset the offset of the buf in the dmabuf region
+ * @return  FI_SUCCESS if dmabuf fd and offset are retrieved successfully
+ *         -FI_EOPNOTSUPP if dmabuf is not supported on the cuda device
+ *         -FI_EIO upon CUDA API error
+ */
+int cuda_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
+			uint64_t *offset)
+{
+	CUdeviceptr aligned_ptr;
+	CUresult cuda_ret;
+	uintptr_t last_byte;
+	size_t aligned_size;
+	size_t host_page_size = ofi_get_page_size();
+
+	if (!cuda_is_dmabuf_supported())
+		return -FI_EOPNOTSUPP;
+
+	/*
+	 * The page end is used as an inclusive address (note the + 1 in
+	 * the size computation), so it has to be derived from the last
+	 * byte of the buffer rather than from the one-past-the-end
+	 * address. Otherwise a buffer ending exactly on a page boundary
+	 * would be extended by one extra page, which may lie outside of
+	 * the cuda allocation. A zero size keeps using addr itself.
+	 */
+	last_byte = (uintptr_t) addr + (size ? size - 1 : 0);
+
+	aligned_ptr = (uintptr_t) ofi_get_page_start(addr, host_page_size);
+	aligned_size = (uintptr_t) ofi_get_page_end((void *) last_byte,
+						    host_page_size) - (uintptr_t) aligned_ptr + 1;
+
+	cuda_ret = cuda_ops.cuMemGetHandleForAddressRange(
+						(void *)fd,
+						aligned_ptr, aligned_size,
+						CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+						0);
+	if (cuda_ret != CUDA_SUCCESS) {
+		CUDA_DRIVER_LOG_ERR(cuda_ret, "cuMemGetHandleForAddressRange");
+		return -FI_EIO;
+	}
+
+	/* addr is never below its own page start, so this cannot wrap */
+	*offset = (uintptr_t) addr - (uintptr_t) aligned_ptr;
+
+	FI_INFO(&core_prov, FI_LOG_CORE,
+		"Get dma buf handle with fd: %d, offset: %lu"
+		", page aligned base address: %p"
+		", page aligned size: %lu, cuda allocation address %p"
+		", cuda allocation length: %lu\n",
+		*fd, *offset,
+		(void *) aligned_ptr, aligned_size,
+		(void *) addr, size);
+	return FI_SUCCESS;
+}
+
 int cuda_hmem_init(void)
 {
 	int ret;
@@ -635,6 +744,10 @@ int cuda_hmem_init(void)
 	if (ret != FI_SUCCESS)
 		goto dl_cleanup;
 
+	ret = cuda_hmem_detect_dmabuf_support();
+	if (ret != FI_SUCCESS)
+		goto dl_cleanup;
+
 	ret = 1;
 	fi_param_get_bool(NULL, "hmem_cuda_use_gdrcopy",
 			  &ret);
@@ -783,6 +896,11 @@ bool cuda_is_gdrcopy_enabled(void)
 	return cuda_attr.use_gdrcopy;
 }
 
+/**
+ * @brief Report the dmabuf support status recorded in cuda_attr
+ *
+ * @return the current value of cuda_attr.dmabuf_supported
+ */
+bool cuda_is_dmabuf_supported(void)
+{
+	const bool supported = cuda_attr.dmabuf_supported;
+
+	return supported;
+}
+
 #else
 
 int cuda_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size)
@@ -878,6 +996,17 @@ bool cuda_is_gdrcopy_enabled(void)
 	return false;
 }
 
+/*
+ * Fallback variant placed after the #else of this file (presumably the
+ * build without HAVE_CUDA - confirm against the full file): dmabuf is
+ * always reported as unsupported.
+ */
+bool cuda_is_dmabuf_supported(void)
+{
+	return false;
+}
+
+/*
+ * Fallback variant placed after the #else of this file (presumably the
+ * build without HAVE_CUDA - confirm against the full file): no argument
+ * is read or written and the call always fails with -FI_ENOSYS.
+ */
+int cuda_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
+			uint64_t *offset)
+{
+	return -FI_ENOSYS;
+}
+
 int cuda_set_sync_memops(void *ptr)
 {
         return FI_SUCCESS;