From 1ee052917a87b9f21ace0d105b6ed51ed8b1ed78 Mon Sep 17 00:00:00 2001
From: Eric Raut
Date: Tue, 20 Feb 2024 22:33:27 +0000
Subject: [PATCH 1/3] rdma: move idpool init to endpoint refcnt+ stage

The idpool is freed when refcnt goes to zero, so it should be
initialized again when refcnt increases.

Signed-off-by: Eric Raut
(cherry picked from commit 213172a71dd65addd329f643ac8dea9078e33937)
---
 src/nccl_ofi_rdma.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index 6fdc53875..3acfb9c70 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -5570,22 +5570,6 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
 		/* Initialize number of rail */
 		ep->num_rails = num_rails;
 
-		/* Initialize endpoint ID pool */
-		ep->comm_idpool = malloc(sizeof(nccl_ofi_idpool_t));
-		if (OFI_UNLIKELY(ep->comm_idpool == NULL)) {
-			ret = ncclSystemError;
-			NCCL_OFI_WARN("Unable to allocate rdma endpoint ID pool");
-			goto unlock;
-		}
-
-		ret = nccl_ofi_idpool_init(ep->comm_idpool, device->num_comm_ids);
-		if (OFI_UNLIKELY(ret != 0)) {
-			ret = ncclSystemError;
-			free(ep->comm_idpool);
-			ep->comm_idpool = NULL;
-			goto unlock;
-		}
-
 		/* Initialize reference count */
 		ep->ref_cnt = 0;
 
@@ -5625,6 +5609,22 @@ static int get_ep(nccl_net_ofi_device_t *base_dev,
 			goto unlock;
 		}
 
+		/* Initialize endpoint ID pool */
+		ep->comm_idpool = malloc(sizeof(nccl_ofi_idpool_t));
+		if (OFI_UNLIKELY(ep->comm_idpool == NULL)) {
+			ret = ncclSystemError;
+			NCCL_OFI_WARN("Unable to allocate rdma endpoint ID pool");
+			goto unlock;
+		}
+
+		ret = nccl_ofi_idpool_init(ep->comm_idpool, device->num_comm_ids);
+		if (OFI_UNLIKELY(ret != 0)) {
+			ret = ncclSystemError;
+			free(ep->comm_idpool);
+			ep->comm_idpool = NULL;
+			goto unlock;
+		}
+
 		ret = init_rail_ofi_resources(device, ep);
 		if (ret != 0) {
 			goto unlock;

From ee476d21918dc3e70e6736976833b0a0482aac34 Mon Sep 17 00:00:00 2001
From: Raghu Raja
Date: Thu, 22 Feb 2024 07:27:22 +0000
Subject: [PATCH 2/3] ci: Add quick distribution checks to CI

During the 1.8.0-aws release, we realized the distribution generation
had been broken for a while. Adding quick sanity checks as a GitHub
Action to prevent this from happening in the future. Runs to completion
in 2 minutes.

Signed-off-by: Raghu Raja
(cherry picked from commit e2f795d29f7ebc08ef6ec3e4406beebd225c8042)
---
 .github/workflows/distcheck.yaml | 58 ++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 .github/workflows/distcheck.yaml

diff --git a/.github/workflows/distcheck.yaml b/.github/workflows/distcheck.yaml
new file mode 100644
index 000000000..d9f1bf203
--- /dev/null
+++ b/.github/workflows/distcheck.yaml
@@ -0,0 +1,58 @@
+name: PR CI
+on: [push, pull_request]
+env:
+  APT_PACKAGES: >-
+    build-essential
+    clang
+    gcc
+    git
+    libhwloc-dev
+    make
+jobs:
+  distcheck:
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        cc:
+          - gcc
+          - clang
+      fail-fast: false
+    steps:
+      - name: Install Dependencies
+        run: |
+          sudo apt-get install -y ${{ env.APT_PACKAGES }}
+      - name: Install CUDA
+        run: |
+          sudo apt-get install -y nvidia-cuda-toolkit
+      - uses: actions/checkout@v4
+      - name: Build and Distribution Checks
+        run: |
+          set -x
+
+          # We're just doing distchecks, so it is fine if we
+          # just grab the latest master.
+          git clone --depth 1 https://github.com/ofiwg/libfabric.git
+          pushd libfabric
+          ./autogen.sh
+          ./configure --prefix=$PWD/install CC=${{ matrix.cc }}
+          make -j $(nproc)
+          make install
+          popd
+
+          # actions/checkout@v4 would drop the plugin source in $PWD,
+          # so go ahead and build it
+          ./autogen.sh
+          ./configure --with-libfabric=$PWD/libfabric/install --with-cuda=/usr/local/cuda/ CC=${{ matrix.cc }}
+          make -j $(nproc)
+
+          # Run Unit tests
+          make check
+
+          # Run dist tarball checks
+          make distcheck
+      - name: Upload build logs
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.cc }}-config.log
+          path: config.log

From 2cf8196e1f565d2fa36359fcf4512705b3b94d03 Mon Sep 17 00:00:00 2001
From: Raghu Raja
Date: Wed, 21 Feb 2024 21:48:13 +0000
Subject: [PATCH 3/3] Always propagate completion errors to NCCL

When progressing completions, we could get a completion error for a
request before NCCL gets to calling test() explicitly for that request.
Since NCCL tests for completions in order, this can lead to hangs when
there are non-recoverable failures in the network and NCCL never
receives a successful completion for the earliest request.

With this change, completion errors are always passed up the stack so
NCCL can abort the job and fail gracefully where possible. This logic
can be enhanced further based on provider-specific information from the
completion error entry to distinguish between fatal errors and
recoverable user errors, but that would not be portable.

Fixes #346

Signed-off-by: Raghu Raja
(cherry picked from commit 5aac4dc0f9dc70d31e99be1b14d8b65057afa638)
---
 src/nccl_ofi_api.c  |  37 +++++++
 src/nccl_ofi_rdma.c | 237 ++++++++++++++++++++++++--------------------
 2 files changed, 168 insertions(+), 106 deletions(-)

diff --git a/src/nccl_ofi_api.c b/src/nccl_ofi_api.c
index 3b78f0718..6db98c623 100644
--- a/src/nccl_ofi_api.c
+++ b/src/nccl_ofi_api.c
@@ -24,14 +24,51 @@ ncclDebugLogger_t ofi_log_function = NULL;
 
 static ncclResult_t nccl_net_ofi_retval_translate(int retval)
 {
+	/*
+	 * This translates both ISO C errnos as well as libfabric errnos (up to
+	 * FI_ERRNO_OFFSET they are synonymous).
+	 */
 	switch (retval) {
 	case 0:
 		return ncclSuccess;
 		break;
 	case -EINVAL:
+		/*
+		 * Per ext-net docs, invalid arguments to plugin calls should
+		 * return ncclInternalError. Although a ncclInvalidArgument is defined,
+		 * it is suggested that ext-net plugins not pass these up and
+		 * leave NCCL API argument validation to NCCL.
+		 */
+		return ncclInternalError;
+		break;
+	case -EMSGSIZE:
+		/*
+		 * TODO: Per ext-net docs, this aligns with ncclInvalidUsage,
+		 * which is also defined in NCCL source, but for some reason
+		 * that error type is not available in err.h that we pull from
+		 * ext-net headers upstream. This needs to be fixed once the
+		 * ext-net header gets fixed to include ncclInvalidUsage.
+		 */
 		return ncclInvalidArgument;
 		break;
+	case -ECONNABORTED:
+	case -ECONNRESET:
+	case -ECONNREFUSED:
+	case -ENOTCONN:
+	case -EHOSTDOWN:
+	case -EHOSTUNREACH:
+		/*
+		 * Pass up ncclRemoteError (introduced in NCCL 2.13.4, but
+		 * missing in ext-net documentation) for any unrecoverable peer
+		 * reachability errors.
+		 */
+		return ncclRemoteError;
+		break;
 	default:
+		/*
+		 * Catch-all for other errors, including libfabric-specific error
+		 * codes.
+		 */
 		return ncclSystemError;
 		break;
 	}
diff --git a/src/nccl_ofi_rdma.c b/src/nccl_ofi_rdma.c
index 3acfb9c70..451170e79 100644
--- a/src/nccl_ofi_rdma.c
+++ b/src/nccl_ofi_rdma.c
@@ -1322,6 +1322,71 @@ static inline int handle_write_comp(struct fi_cq_tagged_entry *cq_entry,
 
 static int finish_connect(nccl_net_ofi_rdma_send_comm_t *s_comm);
 
+static const char *req_state_str(nccl_net_ofi_rdma_req_state_t state)
+{
+	switch(state) {
+	case NCCL_OFI_RDMA_REQ_CREATED:
+		return "CREATED";
+	case NCCL_OFI_RDMA_REQ_PENDING:
+		return "PENDING";
+	case NCCL_OFI_RDMA_REQ_COMPLETED:
+		return "COMPLETED";
+	case NCCL_OFI_RDMA_REQ_ERROR:
+		return "ERROR";
+	}
+	return "unknown";
+}
+
+static const char *req_type_str(nccl_net_ofi_rdma_req_type_t type)
+{
+	switch(type) {
+	case NCCL_OFI_RDMA_SEND_CONN:
+		return "SEND_CONN";
+	case NCCL_OFI_RDMA_SEND_CONN_RESP:
+		return "SEND_CONN_RESP";
+	case NCCL_OFI_RDMA_RECV_CONN:
+		return "RECV_CONN";
+	case NCCL_OFI_RDMA_RECV_CONN_RESP:
+		return "RECV_CONN_RESP";
+	case NCCL_OFI_RDMA_SEND:
+		return "SEND";
+	case NCCL_OFI_RDMA_RECV:
+		return "RECV";
+	case NCCL_OFI_RDMA_SEND_CTRL:
+		return "SEND_CTRL";
+	case NCCL_OFI_RDMA_RECV_SEGMS:
+		return "RECV_SEGMS";
+	case NCCL_OFI_RDMA_BOUNCE:
+		return "BOUNCE";
+	case NCCL_OFI_RDMA_FLUSH:
+		return "FLUSH";
+	case NCCL_OFI_RDMA_EAGER_COPY:
+		return "EAGER_COPY";
+	}
+	return "unknown";
+}
+
+/*
+ * @brief	Print NCCL OFI request information
+ */
+static const char *nccl_net_ofi_req_str(nccl_net_ofi_rdma_req_t *req)
+{
+	static char buf[256];
+	snprintf(buf, sizeof(buf), "{ dev: %d, size: %zu, state: %s, type: %s }",
+		req->dev_id,
+		req->size,
+		req_state_str(req->state),
+		req_type_str(req->type)
+	);
+	return buf;
+}
+
+static int post_rdma_ctrl(nccl_net_ofi_rdma_req_t *req);
+
+static int post_flush_req(nccl_net_ofi_rdma_req_t *req);
+
+static int post_eager_copy(nccl_net_ofi_rdma_req_t *req);
+
 /*
  * @brief	Processes completion entries from CQ
  *
@@ -1450,70 +1515,73 @@ static inline int process_completions(struct fi_cq_tagged_entry *cq_entry,
 	return ret;
 }
 
-static const char *req_state_str(nccl_net_ofi_rdma_req_state_t state)
-{
-	switch(state) {
-	case NCCL_OFI_RDMA_REQ_CREATED:
-		return "CREATED";
-	case NCCL_OFI_RDMA_REQ_PENDING:
-		return "PENDING";
-	case NCCL_OFI_RDMA_REQ_COMPLETED:
-		return "COMPLETED";
-	case NCCL_OFI_RDMA_REQ_ERROR:
-		return "ERROR";
-	}
-	return "unknown";
-}
-
-static const char *req_type_str(nccl_net_ofi_rdma_req_type_t type)
-{
-	switch(type) {
-	case NCCL_OFI_RDMA_SEND_CONN:
-		return "SEND_CONN";
-	case NCCL_OFI_RDMA_SEND_CONN_RESP:
-		return "SEND_CONN_RESP";
-	case NCCL_OFI_RDMA_RECV_CONN:
-		return "RECV_CONN";
-	case NCCL_OFI_RDMA_RECV_CONN_RESP:
-		return "RECV_CONN_RESP";
-	case NCCL_OFI_RDMA_SEND:
-		return "SEND";
-	case NCCL_OFI_RDMA_RECV:
-		return "RECV";
-	case NCCL_OFI_RDMA_SEND_CTRL:
-		return "SEND_CTRL";
-	case NCCL_OFI_RDMA_RECV_SEGMS:
-		return "RECV_SEGMS";
-	case NCCL_OFI_RDMA_BOUNCE:
-		return "BOUNCE";
-	case NCCL_OFI_RDMA_FLUSH:
-		return "FLUSH";
-	case NCCL_OFI_RDMA_EAGER_COPY:
-		return "EAGER_COPY";
-	}
-	return "unknown";
-}
-
 /*
- * @brief	Print NCCL OFI request information
+ * @brief	Process error completion entries from the CQ error queue
+ *
+ * @return	0, on success
+ *		error, on others
  */
-static const char *nccl_net_ofi_req_str(nccl_net_ofi_rdma_req_t *req)
+static inline int process_err_completion(nccl_net_ofi_rdma_ep_t *ep,
+					 int rail_id)
 {
-	static char buf[256];
-	snprintf(buf, sizeof(buf), "{ dev: %d, size: %zu, state: %s, type: %s }",
-		req->dev_id,
-		req->size,
-		req_state_str(req->state),
-		req_type_str(req->type)
-	);
-	return buf;
-}
+	nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
+	struct fi_cq_err_entry err_entry = { 0 };
+	nccl_net_ofi_rdma_req_t *req = NULL;
+	int ret = 0;
 
-static int post_rdma_ctrl(nccl_net_ofi_rdma_req_t *req);
+	ret = fi_cq_readerr(rail->cq, &err_entry, 0);
+	if (OFI_UNLIKELY(ret == -FI_EAGAIN)) {
+		/*
+		 * Error not available yet.
+		 * fi_cq_read will keep returning -FI_EAVAIL so just bail out and try again later.
+		 */
+		return 0;
+	} else if (OFI_UNLIKELY(ret < 0)) {
+		NCCL_OFI_WARN("Unable to read from fi_cq_readerr. RC: %d. Error: %s",
+			      ret, fi_strerror(-ret));
+		goto exit;
+	}
 
-static int post_flush_req(nccl_net_ofi_rdma_req_t *req);
+	if (err_entry.flags & FI_REMOTE_WRITE) {
+		req = get_req_from_imm_data(ep, err_entry.data);
+		if (!req) {
+			NCCL_OFI_WARN("Unknown remote write error, could not get CQ data");
+			ret = -EIO;
+			goto exit;
+		}
+	} else {
+		/* For all other operations, ctx should be a req */
+		if (!err_entry.op_context) {
+			NCCL_OFI_WARN("Operation with NULL context completed with error");
+			ret = -EIO;
+			goto exit;
+		}
+		req = err_entry.op_context;
+	}
 
-static int post_eager_copy(nccl_net_ofi_rdma_req_t *req);
+	NCCL_OFI_WARN("Request %p completed with error. RC: %d. Error: %s. Completed length: %ld, Request: %s",
+		      req, err_entry.err,
+		      fi_cq_strerror(rail->cq, err_entry.prov_errno, err_entry.err_data, NULL, 0),
+		      (long)err_entry.len, nccl_net_ofi_req_str(req));
+	if (req->type == NCCL_OFI_RDMA_BOUNCE) {
+		/* A bounce buffer receive failed -- this is an internal error so bail out */
+		NCCL_OFI_WARN("Fatal: Bounce buffer recv completed with error");
+	} else {
+		/* Move user-facing request to error state */
+		set_request_state_to_error(req);
+	}
+
+	/*
+	 * Libfabric error codes directly map to ISO C errno values for standard
+	 * error codes up to FI_ERRNO_OFFSET, and libfabric-specific error codes
+	 * beyond. nccl_net_ofi_retval_translate() will figure out
+	 * how to deal with these, so it is safe to pass up the err as-is.
+	 * However, any special-handling for prov_errno should be handled here.
+	 */
+	ret = -err_entry.err;
+exit:
+	return ret;
+}
 
 /*
  * Progress a request associated with recv
@@ -1631,11 +1699,9 @@ static int process_pending_reqs(nccl_net_ofi_rdma_ep_t *ep)
  */
 static int ofi_process_cq(nccl_net_ofi_rdma_ep_t *ep)
 {
+	struct fi_cq_tagged_entry cqe_tagged_buffers[cq_read_count];
 	ssize_t rc = 0;
 	int ret = 0;
-	struct fi_cq_err_entry err_buffer = { 0 };
-	struct fi_cq_tagged_entry cqe_tagged_buffers[cq_read_count];
-	nccl_net_ofi_rdma_req_t *req = NULL;
 
 	for (int rail_id = 0; rail_id != ep->num_rails; ++rail_id) {
 		nccl_net_ofi_ep_rail_t *rail = get_rail(ep, rail_id);
@@ -1649,53 +1715,12 @@ static int ofi_process_cq(nccl_net_ofi_rdma_ep_t *ep)
 			if (OFI_UNLIKELY(ret != 0))
 				goto exit;
 		} else if (OFI_UNLIKELY(rc == -FI_EAVAIL)) {
-			rc = fi_cq_readerr(rail->cq, &err_buffer, 0);
-			if (OFI_UNLIKELY(rc == -FI_EAGAIN)) {
-				/*
-				 * Error not available yet.
-				 * fi_cq_read will keep returning -FI_EAVAIL so just bail out and try again later.
-				 */
+			ret = process_err_completion(ep, rail_id);
+			if (ret == 0)
+				/* Error entry not available yet */
 				break;
-			} else if (OFI_UNLIKELY(rc < 0)) {
-				NCCL_OFI_WARN("Unable to read from fi_cq_readerr. RC: %zd. Error: %s",
-					      rc,
-					      fi_strerror(-rc));
-				ret = ncclSystemError;
-				goto exit;
-			}
-			if (err_buffer.flags & FI_REMOTE_WRITE) {
-				req = get_req_from_imm_data(ep, err_buffer.data);
-				if (!req) {
-					NCCL_OFI_WARN("Unknown remote write error");
-					ret = ncclSystemError;
-					goto exit;
-				}
-			} else {
-				/* For all other operations, ctx should be a req */
-				if (!err_buffer.op_context) {
-					NCCL_OFI_WARN("Operation with NULL context completed with error!");
-					ret = ncclSystemError;
-					goto exit;
-				}
-				req = err_buffer.op_context;
-			}
-
-			NCCL_OFI_WARN("Request %p completed with error. RC: %d. Error: %s. Completed length: %ld, Request: %s",
-				      req,
-				      err_buffer.err,
-				      fi_cq_strerror(rail->cq,
-						     err_buffer.prov_errno,
-						     err_buffer.err_data, NULL, 0),
-				      (long)err_buffer.len,
-				      nccl_net_ofi_req_str(req));
-			set_request_state_to_error(req);
-
-			if (req->type == NCCL_OFI_RDMA_BOUNCE) {
-				/* A bounce buffer receive failed -- this is an internal error so bail out */
-				NCCL_OFI_WARN("Fatal: Bounce buffer recv completed with error");
-				ret = ncclSystemError;
+			else
 				goto exit;
-			}
 		} else if (rc == -FI_EAGAIN) {
 			/* No completions to process */
 			break;