Skip to content

Commit

Permalink
hmem: Add new op to hmem_ops for getting dmabuf fd
Browse files Browse the repository at this point in the history
Now that we have multiple ifaces that support dmabuf, it makes sense to have
a common method to retrieve the fd associated with the dmabuf object.

The verbs and efa providers are updated accordingly.

Signed-off-by: Jianxin Xiong <[email protected]>
  • Loading branch information
j-xiong committed Oct 26, 2023
1 parent 2901c26 commit cf6ad9a
Show file tree
Hide file tree
Showing 8 changed files with 114 additions and 48 deletions.
16 changes: 14 additions & 2 deletions include/ofi_hmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ struct ofi_hmem_ops {
const void *src, size_t size);
int (*dev_reg_copy_from_hmem)(uint64_t handle, void *dest,
const void *src, size_t size);
int (*get_dmabuf_fd)(void *addr, uint64_t size, int *fd,
uint64_t *offset);
};

extern struct ofi_hmem_ops hmem_ops[];
Expand Down Expand Up @@ -233,6 +235,7 @@ int ze_dev_reg_copy_to_hmem(uint64_t handle, void *dest, const void *src,
size_t size);
int ze_dev_reg_copy_from_hmem(uint64_t handle, void *dest, const void *src,
size_t size);
int ze_hmem_get_dmabuf_fd(void *addr, uint64_t size, int *fd, uint64_t *offset);

int neuron_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size);
int neuron_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size);
Expand All @@ -242,15 +245,16 @@ int neuron_hmem_init(void);
int neuron_hmem_cleanup(void);
void *neuron_alloc(void **handle, size_t size);
void neuron_free(void **handle);
int neuron_get_dmabuf_fd(uint64_t va, uint64_t size, int* fd);
int neuron_get_dmabuf_fd(void *addr, uint64_t size, int *fd, uint64_t *offset);

int synapseai_init(void);
int synapseai_cleanup(void);
int synapseai_copy_to_hmem(uint64_t device, void *dest, const void *src,
size_t size);
int synapseai_copy_from_hmem(uint64_t device, void *dest, const void *src,
size_t size);
int synapseai_get_dmabuf_fd(uint64_t addr, uint64_t size, int* fd);
int synapseai_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
uint64_t *offset);
bool synapseai_is_addr_valid(const void *addr, uint64_t *device,
uint64_t *flags);
int synapseai_host_register(void *ptr, size_t size);
Expand Down Expand Up @@ -340,6 +344,12 @@ static inline bool ofi_hmem_no_is_ipc_enabled(void)
return false;
}

/*
 * Placeholder get_dmabuf_fd op. None of the arguments are examined and
 * neither *fd nor *offset is written; the call always fails with
 * -FI_ENOSYS.
 */
static inline int ofi_hmem_no_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
					    uint64_t *offset)
{
	/* Parameters exist only to match the get_dmabuf_fd op signature. */
	(void)addr;
	(void)size;
	(void)fd;
	(void)offset;

	return -FI_ENOSYS;
}

static inline bool ofi_hmem_p2p_disabled(void)
{
return ofi_hmem_disable_p2p;
Expand Down Expand Up @@ -420,5 +430,7 @@ int ofi_hmem_dev_reg_copy_to_hmem(enum fi_hmem_iface iface, uint64_t handle,
void *dest, const void *src, size_t size);
int ofi_hmem_dev_reg_copy_from_hmem(enum fi_hmem_iface iface, uint64_t handle,
void *dest, const void *src, size_t size);
/*
 * Retrieve the dma-buf fd, and the offset within the dma-buf object, for
 * the device buffer described by addr and size, using the get_dmabuf_fd
 * op of the given hmem iface. The op's return value is handed back as is.
 */
int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface iface, void *addr, uint64_t size,
			   int *fd, uint64_t *offset);

#endif /* _OFI_HMEM_H_ */
5 changes: 3 additions & 2 deletions prov/efa/src/efa_hmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain)
void *ptr = NULL;
size_t len = ofi_get_page_size() * 2, tmp_value;
int dmabuf_fd;
uint64_t offset;
int ret;

if (!ofi_hmem_is_initialized(FI_HMEM_NEURON)) {
Expand Down Expand Up @@ -255,10 +256,10 @@ static int efa_domain_hmem_info_init_neuron(struct efa_domain *efa_domain)
/* Neuron currently requires P2P */
info->p2p_required_by_impl = true;

ret = neuron_get_dmabuf_fd((uint64_t)ptr, (uint64_t)len, &dmabuf_fd);
ret = neuron_get_dmabuf_fd(ptr, (uint64_t)len, &dmabuf_fd, &offset);
if (ret == FI_SUCCESS) {
ibv_mr = ibv_reg_dmabuf_mr(
g_device_list[0].ibv_pd, 0,
g_device_list[0].ibv_pd, offset,
len, (uint64_t)ptr, dmabuf_fd, ibv_access);
} else if (ret == -FI_ENOPROTOOPT) {
EFA_INFO(FI_LOG_MR,
Expand Down
15 changes: 9 additions & 6 deletions prov/efa/src/efa_mr.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,16 +506,17 @@ struct fi_ops efa_mr_ops = {
static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr *mr_attr, int access)
{
int dmabuf_fd;
uint64_t offset;
int ret;
if (efa_mr_is_synapseai(efa_mr)) {
ret = synapseai_get_dmabuf_fd((uint64_t) mr_attr->mr_iov->iov_base,
ret = synapseai_get_dmabuf_fd(mr_attr->mr_iov->iov_base,
(uint64_t) mr_attr->mr_iov->iov_len,
&dmabuf_fd);
&dmabuf_fd, &offset);
if (ret != FI_SUCCESS) {
EFA_WARN(FI_LOG_MR, "Unable to get dmabuf fd for Gaudi device buffer \n");
return NULL;
}
return ibv_reg_dmabuf_mr(efa_mr->domain->ibv_pd, 0,
return ibv_reg_dmabuf_mr(efa_mr->domain->ibv_pd, offset,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);
Expand Down Expand Up @@ -544,15 +545,17 @@ static struct ibv_mr *efa_mr_reg_ibv_mr(struct efa_mr *efa_mr, struct fi_mr_attr
}

int dmabuf_fd, ret;
uint64_t offset;

ret = neuron_get_dmabuf_fd(
(uint64_t) mr_attr->mr_iov->iov_base,
mr_attr->mr_iov->iov_base,
(uint64_t) mr_attr->mr_iov->iov_len,
&dmabuf_fd);
&dmabuf_fd, &offset);

if (ret == FI_SUCCESS) {
/* Success => invoke ibv_reg_dmabuf_mr */
return ibv_reg_dmabuf_mr(
efa_mr->domain->ibv_pd, 0,
efa_mr->domain->ibv_pd, offset,
mr_attr->mr_iov->iov_len,
(uint64_t)mr_attr->mr_iov->iov_base,
dmabuf_fd, access);
Expand Down
52 changes: 27 additions & 25 deletions prov/verbs/src/verbs_mr.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,57 +58,59 @@ static struct fi_ops vrb_mr_fi_ops = {
};

#if VERBS_HAVE_DMABUF_MR
static struct ibv_mr *vrb_reg_ze_dmabuf(struct ibv_pd *pd, const void *buf,
size_t len, int vrb_access)
/*
 * Register [buf, buf + len) with the verbs device as a dma-buf memory
 * region for the given hmem iface.
 *
 * The dma-buf fd and offset are obtained through ofi_hmem_get_dmabuf_fd()
 * and handed to ibv_reg_dmabuf_mr(). Depending on a per-iface failover
 * policy the function may instead (or afterwards) register the buffer with
 * plain ibv_reg_mr().
 *
 * Returns the registered MR, or NULL on failure. When the dma-buf
 * registration failed and the ibv_reg_mr() fallback failed as well, errno
 * is restored to the value left by ibv_reg_dmabuf_mr().
 *
 * NOTE(review): failover_policy is function-static and is updated without
 * any locking visible here -- confirm that callers serialize registration.
 */
static struct ibv_mr *vrb_reg_hmem_dmabuf(enum fi_hmem_iface iface,
struct ibv_pd *pd, const void *buf,
size_t len, int vrb_access)
{
void *handle;
void *base;
uint64_t offset;
int err;
int fd;
uint64_t offset;
struct ibv_mr *mr;
int saved_errno = 0;
/*
 * Failover policy states:
 *   TRY    - attempt dma-buf registration, fall back to ibv_reg_mr()
 *            if it fails and peer-mem support is reported
 *   ALWAYS - skip dma-buf registration and go straight to ibv_reg_mr()
 *   NEVER  - use dma-buf registration only, no fallback
 */
enum { TRY, ALWAYS, NEVER };
static int failover_policy = TRY;

if (failover_policy == ALWAYS)
/*
 * Initial policy per iface. The array is static, so the transitions made
 * below persist across calls.
 */
static int failover_policy[] = {
[FI_HMEM_SYSTEM] = ALWAYS,
[FI_HMEM_CUDA] = TRY,
[FI_HMEM_ROCR] = TRY,
[FI_HMEM_ZE] = TRY,
[FI_HMEM_NEURON] = NEVER,
[FI_HMEM_SYNAPSEAI] = NEVER,
};

if (failover_policy[iface] == ALWAYS)
goto failover;

err = ze_hmem_get_handle((void *)buf, len, &handle);
if (err)
return NULL;

err = ze_hmem_get_base_addr((void *)buf, len, &base, NULL);
/* Any non-zero result aborts the registration; errno is not set here. */
err = ofi_hmem_get_dmabuf_fd(iface, (void *)buf, len, &fd, &offset);
if (err)
return NULL;

offset = (uintptr_t)buf - (uintptr_t)base;
mr = ibv_reg_dmabuf_mr(pd, offset, len, (uint64_t)buf/* iova */,
(int)(uintptr_t)handle/* dmabuf fd */,
vrb_access);
if (!mr && failover_policy == TRY && vrb_gl_data.peer_mem_support) {
fd, vrb_access);
if (!mr && failover_policy[iface] == TRY &&
vrb_gl_data.peer_mem_support) {
/* Keep the dma-buf error so it can be restored if the fallback fails too. */
saved_errno = errno;
goto failover;
}

failover_policy = NEVER;
/*
 * Reached when the dma-buf registration succeeded, or when it failed and
 * no fallback was taken; either way the fallback is disabled from now on.
 */
failover_policy[iface] = NEVER;
return mr;

/* Fallback path: register the buffer with plain ibv_reg_mr(). */
failover:
mr = ibv_reg_mr(pd, (void *)buf, len, vrb_access);
if (!mr) {
if (saved_errno) {
/* NOTE(review): len is size_t; "%zu" is the matching conversion, not "%zd". */
FI_INFO(&vrb_prov, FI_LOG_MR,
"Failover failed: ibv_reg_mr(%p, %zd) error %d\n",
"Failover failed: ibv_reg_mr(%p, %zd) error %d, iface %d\n",
buf, len, errno);
buf, len, errno, iface);
/* Report the original dma-buf failure rather than the fallback's. */
errno = saved_errno;
}
return NULL;
}

if (failover_policy == TRY) {
failover_policy = ALWAYS;
/* First successful fallback: later calls for this iface skip the dma-buf attempt. */
if (failover_policy[iface] == TRY) {
failover_policy[iface] = ALWAYS;
FI_INFO(&vrb_prov, FI_LOG_MR,
"Failover on: ibv_reg_dmabuf_mr() ==> ibv_reg_mr()\n");
"Failover on: ibv_reg_dmabuf_mr() ==> ibv_reg_mr(), iface %d\n", iface);
}
return mr;
}
Expand Down Expand Up @@ -141,8 +143,8 @@ vrb_mr_reg_common(struct vrb_mem_desc *md, int vrb_access, const void *base_addr
len, (uintptr_t) base_addr + (uintptr_t) buf,
(int) device, vrb_access);
else if (iface == FI_HMEM_ZE && vrb_gl_data.dmabuf_support)
md->mr = vrb_reg_ze_dmabuf(md->domain->pd, buf, len,
vrb_access);
md->mr = vrb_reg_hmem_dmabuf(iface, md->domain->pd, buf, len,
vrb_access);
else
#endif
md->mr = ibv_reg_mr(md->domain->pd, (void *) buf, len,
Expand Down
12 changes: 12 additions & 0 deletions src/hmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_unregister = ofi_hmem_system_dev_unregister,
.dev_reg_copy_to_hmem = ofi_hmem_system_dev_reg_copy,
.dev_reg_copy_from_hmem = ofi_hmem_system_dev_reg_copy,
.get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd,
},
[FI_HMEM_CUDA] = {
.initialized = false,
Expand All @@ -165,6 +166,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_unregister = cuda_dev_unregister,
.dev_reg_copy_to_hmem = cuda_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = cuda_dev_reg_copy_from_hmem,
.get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd,
},
[FI_HMEM_ROCR] = {
.initialized = false,
Expand All @@ -190,6 +192,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_unregister = rocr_dev_unregister,
.dev_reg_copy_to_hmem = rocr_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = rocr_dev_reg_copy_from_hmem,
.get_dmabuf_fd = ofi_hmem_no_get_dmabuf_fd,
},
[FI_HMEM_ZE] = {
.initialized = false,
Expand All @@ -215,6 +218,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_unregister = ze_dev_unregister,
.dev_reg_copy_to_hmem = ze_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = ze_dev_reg_copy_from_hmem,
.get_dmabuf_fd = ze_hmem_get_dmabuf_fd,
},
[FI_HMEM_NEURON] = {
.initialized = false,
Expand All @@ -239,6 +243,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_unregister = ofi_hmem_no_dev_unregister,
.dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem,
.get_dmabuf_fd = neuron_get_dmabuf_fd,
},
[FI_HMEM_SYNAPSEAI] = {
.initialized = false,
Expand All @@ -263,6 +268,7 @@ struct ofi_hmem_ops hmem_ops[] = {
.dev_unregister = ofi_hmem_no_dev_unregister,
.dev_reg_copy_to_hmem = ofi_hmem_no_dev_reg_copy_to_hmem,
.dev_reg_copy_from_hmem = ofi_hmem_no_dev_reg_copy_from_hmem,
.get_dmabuf_fd = synapseai_get_dmabuf_fd,
},
};

Expand Down Expand Up @@ -695,3 +701,9 @@ int ofi_hmem_dev_reg_copy_from_hmem(enum fi_hmem_iface iface, uint64_t handle,
{
return hmem_ops[iface].dev_reg_copy_from_hmem(handle, dest, src, size);
}

int ofi_hmem_get_dmabuf_fd(enum fi_hmem_iface iface, void *addr, uint64_t size,
int *fd, uint64_t *offset)
{
return hmem_ops[iface].get_dmabuf_fd(addr, size, fd, offset);
}
16 changes: 13 additions & 3 deletions src/hmem_neuron.c
Original file line number Diff line number Diff line change
Expand Up @@ -212,19 +212,28 @@ void neuron_free(void **handle)
* @param addr[in] the device buffer address
* @param size[in] the device buffer size (in bytes)
* @param fd[out] the dma-buf fd
* @param offset[out] the offset within the dma-buf object
* @return int On success, return 0. On failure, return a negative error code
*/
int neuron_get_dmabuf_fd(uint64_t va, uint64_t size, int* fd) {
int neuron_get_dmabuf_fd(void *addr, uint64_t size, int *fd, uint64_t *offset)
{
NRT_STATUS ret;

/* nrt_get_dmabuf_fd symbol doesn't exist in Neuron Runtime */
if (!neuron_ops.nrt_get_dmabuf_fd) {
return -FI_ENOPROTOOPT;
}

ret = neuron_ops.nrt_get_dmabuf_fd(va, size, fd);
ret = neuron_ops.nrt_get_dmabuf_fd((uintptr_t)addr, size, fd);

if (ret == NRT_SUCCESS) {
/*
* The assumption is that nrt_get_dmabuf_fd() would fail for
* any addr that is not the starting address of the dma-buf
* object. Otherwise we need a low level op to get the base
* address of the dma-buf object.
*/
*offset = 0;
return FI_SUCCESS;
} else if (ret == NRT_RESOURCE) {
/* real error from Neuron */
Expand Down Expand Up @@ -281,7 +290,8 @@ void neuron_free(void **handle)
return;
}

int neuron_get_dmabuf_fd(uint64_t va, uint64_t size, int* fd) {
int neuron_get_dmabuf_fd(void *addr, uint64_t size, int *fd, uint64_t *offset)
{
	/*
	 * Stub variant: no argument is read and neither *fd nor *offset is
	 * written. Presumably this is the definition used when Neuron
	 * support is compiled out -- confirm against the enclosing #if.
	 */
	(void)addr;
	(void)size;
	(void)fd;
	(void)offset;

	return -FI_ENOSYS;
}

Expand Down
17 changes: 14 additions & 3 deletions src/hmem_synapseai.c
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,27 @@ int synapseai_host_unregister(void *ptr)
* @param addr[in] the device buffer address
* @param size[in] the device buffer size (in bytes)
* @param fd[out] the dma-buf fd
* @param offset[out] the offset within the dma-buf object
* @return int On success, return 0. On failure, return a negative error code.
*/
int synapseai_get_dmabuf_fd(uint64_t addr, uint64_t size, int* fd)
int synapseai_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
			    uint64_t *offset)
{
	int ret;

	/*
	 * Success is decided below by inspecting *fd, and a caller may hand
	 * in an uninitialized int. Seed it with an invalid descriptor so
	 * that a lookup which fails without writing *fd is reported as an
	 * error instead of reading an indeterminate value.
	 */
	*fd = -1;

	ret = synapseai_ops.hcclLookupDMABuff((uintptr_t)addr, size, fd);
	if (*fd < 0) {
		/* ret is only logged; the caller always sees -FI_EIO. */
		FI_WARN(&core_prov, FI_LOG_CORE,
			"hcclLookupDMABuff failed, ret: %d\n", ret);
		return -FI_EIO;
	}

	/*
	 * The assumption is that hcclLookupDMABuff() would fail for any addr
	 * that is not the starting address of the dma-buf object. Otherwise we
	 * need a low level op to get the base address of the dma-buf object.
	 */
	*offset = 0;

	return FI_SUCCESS;
}

Expand Down Expand Up @@ -174,7 +184,8 @@ int synapseai_host_unregister(void *ptr)
return -FI_ENOSYS;
}

int synapseai_get_dmabuf_fd(uint64_t addr, uint64_t size, int* fd)
int synapseai_get_dmabuf_fd(void *addr, uint64_t size, int *fd,
			    uint64_t *offset)
{
	/*
	 * Stub variant: no argument is read and neither *fd nor *offset is
	 * written. Presumably this is the definition used when SynapseAI
	 * support is compiled out -- confirm against the enclosing #if.
	 */
	(void)addr;
	(void)size;
	(void)fd;
	(void)offset;

	return -FI_ENOSYS;
}
Expand Down
Loading

0 comments on commit cf6ad9a

Please sign in to comment.