From 213172a71dd65addd329f643ac8dea9078e33937 Mon Sep 17 00:00:00 2001
From: Eric Raut
Date: Tue, 20 Feb 2024 22:33:27 +0000
Subject: [PATCH] rdma: move idpool init to endpoint refcnt+ stage

The idpool is freed when refcnt goes to zero, so it should be
initialized again when refcnt increases.

Signed-off-by: Eric Raut
---
 src/nccl_ofi_rdma.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index 6fdc53875..3acfb9c70 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -5570,22 +5570,6 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
 		/* Initialize number of rail */
 		ep->num_rails = num_rails;
 
-		/* Initialize endpoint ID pool */
-		ep->comm_idpool = malloc(sizeof(nccl_ofi_idpool_t));
-		if (OFI_UNLIKELY(ep->comm_idpool == NULL)) {
-			ret = ncclSystemError;
-			NCCL_OFI_WARN("Unable to allocate rdma endpoint ID pool");
-			goto unlock;
-		}
-
-		ret = nccl_ofi_idpool_init(ep->comm_idpool, device->num_comm_ids);
-		if (OFI_UNLIKELY(ret != 0)) {
-			ret = ncclSystemError;
-			free(ep->comm_idpool);
-			ep->comm_idpool = NULL;
-			goto unlock;
-		}
-
 		/* Initialize reference count */
 		ep->ref_cnt = 0;
 
@@ -5625,6 +5609,22 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
 			goto unlock;
 		}
 
+		/* Initialize endpoint ID pool */
+		ep->comm_idpool = malloc(sizeof(nccl_ofi_idpool_t));
+		if (OFI_UNLIKELY(ep->comm_idpool == NULL)) {
+			ret = ncclSystemError;
+			NCCL_OFI_WARN("Unable to allocate rdma endpoint ID pool");
+			goto unlock;
+		}
+
+		ret = nccl_ofi_idpool_init(ep->comm_idpool, device->num_comm_ids);
+		if (OFI_UNLIKELY(ret != 0)) {
+			ret = ncclSystemError;
+			free(ep->comm_idpool);
+			ep->comm_idpool = NULL;
+			goto unlock;
+		}
+
 		ret = init_rail_ofi_resources(device, ep);
 		if (ret != 0) {
 			goto unlock;