From 0f661351a321181bb66d482178657b260f9b739b Mon Sep 17 00:00:00 2001
From: Nicholas Sielicki
Date: Tue, 26 Mar 2024 16:55:43 -0700
Subject: [PATCH] fix(tuner): accept context by argument

Add shims for v1 compat, add context arguments where needed. Also
refactor model code such that it can accept the context as an argument
instead of by global reference.

cr: https://code.amazon.com/reviews/CR-118885749
---
 include/nccl_ofi_tuner.h   |  16 +++--
 src/tuner/nccl_ofi_model.c |  18 +++---
 src/tuner/nccl_ofi_tuner.c | 125 ++++++++++++++++++++++++-------------
 3 files changed, 98 insertions(+), 61 deletions(-)

diff --git a/include/nccl_ofi_tuner.h b/include/nccl_ofi_tuner.h
index 7529163a5..338b44c68 100644
--- a/include/nccl_ofi_tuner.h
+++ b/include/nccl_ofi_tuner.h
@@ -83,24 +83,22 @@ struct nccl_ofi_tuner_model_params {
 	int num_rails;
 };
 
-struct nccl_ofi_tuner_context {
+struct nccl_ofi_tuner_model_dims {
 	/* communicator size */
 	int num_ranks;
 	int num_nodes;
+};
 
+struct nccl_ofi_tuner_context {
+	struct nccl_ofi_tuner_model_dims dims;
 	struct nccl_ofi_tuner_model_params model_params;
 	float base_costs[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
 };
 
-/*
- * Global context, allocated at _init(). This is allocated and initialized once
- * per process.
- */
-extern struct nccl_ofi_tuner_context *nccl_ofi_tuner_ctx;
-
 /* Modeling functions */
-void nccl_ofi_tuner_model_costs();
-float nccl_ofi_tuner_compute_cost(ncclFunc_t func, int algo, int proto, int pipe_ops, size_t size);
+void nccl_ofi_tuner_model_costs(struct nccl_ofi_tuner_context *ctx);
+float nccl_ofi_tuner_compute_cost(struct nccl_ofi_tuner_model_params *params, struct nccl_ofi_tuner_model_dims *dims,
+				  ncclFunc_t func, int algo, int proto, int pipe_ops, size_t size);
 
 #endif /* NCCL_OFI_TUNER_H_ */
diff --git a/src/tuner/nccl_ofi_model.c b/src/tuner/nccl_ofi_model.c
index 98900dfae..58ed91b4e 100644
--- a/src/tuner/nccl_ofi_model.c
+++ b/src/tuner/nccl_ofi_model.c
@@ -17,9 +17,9 @@ float nccl_ofi_tuner_compute_base_cost(ncclFunc_t func, int algo, int proto)
 	return nccl_base_lat[algo][proto];
 }
 
-float nccl_ofi_tuner_compute_cost(ncclFunc_t func, int algo, int proto, int pipe_ops, size_t size)
+float nccl_ofi_tuner_compute_cost(struct nccl_ofi_tuner_model_params *params, struct nccl_ofi_tuner_model_dims *dims,
+				  ncclFunc_t func, int algo, int proto, int pipe_ops, size_t size)
 {
-	struct nccl_ofi_tuner_model_params *params = &nccl_ofi_tuner_ctx->model_params;
 	float cost = -1;
 	float latency = 0;
 	float bw = 0;
@@ -45,22 +45,22 @@ float nccl_ofi_tuner_compute_cost(ncclFunc_t func, int algo, int proto, int pipe
 	case ncclFuncAllReduce:
 		switch(algo) {
 		case NCCL_ALGO_RING:
-			num_steps = 2 * (nccl_ofi_tuner_ctx->num_ranks - 1);
-			num_internode_steps = 2 * nccl_ofi_tuner_ctx->num_nodes;
+			num_steps = 2 * (dims->num_ranks - 1);
+			num_internode_steps = 2 * dims->num_nodes;
 			latency = (num_internode_steps * net_lat)
 				  + (num_steps - num_internode_steps) * p2p_lat;
 			bw = params->internode_bw * params->num_rails * ofi_nccl_tuner_num_channels();
 			break;
 
 		case NCCL_ALGO_NVLS_TREE:
-			latency = 2 * (p2p_lat + (log2(nccl_ofi_tuner_ctx->num_nodes) * net_lat));
+			latency = 2 * (p2p_lat + (log2(dims->num_nodes) * net_lat));
 			bw = NCCL_OFI_MIN(params->intranode_bw, (params->internode_bw * params->num_rails) / 2)
 			     * ofi_nccl_tuner_num_channels();
 			break;
 
 		case NCCL_ALGO_TREE:
-			latency = ((2 * ((nccl_ofi_tuner_ctx->num_ranks / nccl_ofi_tuner_ctx->num_nodes) - 1) * p2p_lat)
-				   + (2 * log2(nccl_ofi_tuner_ctx->num_nodes) * net_lat));
+			latency = ((2 * ((dims->num_ranks / dims->num_nodes) - 1) * p2p_lat)
+				   + (2 * log2(dims->num_nodes) * net_lat));
 			bw = (params->internode_bw * params->num_rails * ofi_nccl_tuner_num_channels()) / 2;
 			break;
 
@@ -99,14 +99,14 @@ float nccl_ofi_tuner_compute_cost(ncclFunc_t func, int algo, int proto, int pipe
  * Compute the base costs for each of the algorithms at plugin initialization
  * time using only the comm size.
  */
-void nccl_ofi_tuner_model_costs()
+void nccl_ofi_tuner_model_costs(struct nccl_ofi_tuner_context *ctx)
 {
 	ncclFunc_t func;
 	int algo, proto = 0;
 	for (func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
 		for (algo = 0; algo < NCCL_NUM_ALGORITHMS; algo++) {
 			for(proto = 0; proto < NCCL_NUM_PROTOCOLS; proto++) {
-				nccl_ofi_tuner_ctx->base_costs[func][algo][proto] =
+				ctx->base_costs[func][algo][proto] =
 					nccl_ofi_tuner_compute_base_cost(func, algo, proto);
 			}
 		}
diff --git a/src/tuner/nccl_ofi_tuner.c b/src/tuner/nccl_ofi_tuner.c
index 5250108e6..2c385a56e 100644
--- a/src/tuner/nccl_ofi_tuner.c
+++ b/src/tuner/nccl_ofi_tuner.c
@@ -7,32 +7,15 @@
 #include "nccl_ofi_tuner.h"
 #include "nccl_ofi_log.h"
 
-struct nccl_ofi_tuner_context *nccl_ofi_tuner_ctx;
 pthread_mutex_t nccl_ofi_tuner_ctx_lock = PTHREAD_MUTEX_INITIALIZER;
 ncclDebugLogger_t ofi_log_function = NULL;
 
-ncclResult_t nccl_ofi_tuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction)
+ncclResult_t nccl_ofi_tuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context)
 {
 	ofi_log_function = logFunction;
+	struct nccl_ofi_tuner_context *nccl_ofi_tuner_ctx;
 
-	/*
-	 * NCCL parses these variables and applies user filters inside its
-	 * current tuner logic. Ideally, this should be done regardless of the
-	 * use of NCCL's internal tuner or an external tuner plugin. For the
-	 * time being, given the external tuner is an opt-in, detect if a user
-	 * has set one of them and bail when an external tuner is loaded.
-	 */
-	if (getenv("NCCL_ALGO") || getenv("NCCL_PROTO")) {
-		NCCL_OFI_WARN("The tuner plugin can not be loaded when explicitly choosing an algorithm or protocol with NCCL_ALGO/NCCL_PROTO");
-		// FIXME: "ncclInvalidUsage should be returned when the error is
-		// most likely a user error" per nccl docs, which arguably makes
-		// it a better return code here than ncclInvalidArgument, but
-		// the former is currently not vended in ext-net headers, so
-		// we're returning ncclInvalidArgument instead.
-		return ncclInvalidArgument;
-	}
-
-	struct nccl_ofi_tuner_model_params params = {
+	const struct nccl_ofi_tuner_model_params params = {
 		.net_lat = ofi_nccl_tuner_net_latency(),
 		.internode_bw = NCCL_OFI_TUNER_INTERNODE_BW,
 		.intranode_bw = NCCL_OFI_TUNER_INTRANODE_BW,
@@ -44,38 +27,38 @@ ncclResult_t nccl_ofi_tuner_init(size_t nRanks, size_t nNodes, ncclDebugLogger_t
 	 * initialization. For now, init a plugin-global context once.
 	 */
 	pthread_mutex_lock(&nccl_ofi_tuner_ctx_lock);
+	nccl_ofi_tuner_ctx = calloc(1, sizeof(struct nccl_ofi_tuner_context));
 	if (!nccl_ofi_tuner_ctx) {
-		nccl_ofi_tuner_ctx = calloc(1, sizeof(struct nccl_ofi_tuner_context));
-		if (!nccl_ofi_tuner_ctx) {
-			NCCL_OFI_WARN("Context allocation failed.");
-			return ncclInternalError;
-		}
+		NCCL_OFI_WARN("Context allocation failed.");
+		return ncclInternalError;
+	}
 
-		nccl_ofi_tuner_ctx->num_ranks = nRanks;
-		nccl_ofi_tuner_ctx->num_nodes = nNodes;
-		nccl_ofi_tuner_ctx->model_params = params;
+	nccl_ofi_tuner_ctx->dims.num_ranks = nRanks;
+	nccl_ofi_tuner_ctx->dims.num_nodes = nNodes;
+	nccl_ofi_tuner_ctx->model_params = params;
 
-		/*
-		 * Build cost model to use from nccl_ofi_tuner_get_coll_info.
-		 */
-		nccl_ofi_tuner_model_costs();
-	}
+	/*
+	 * Build cost model to use from nccl_ofi_tuner_get_coll_info.
+	 */
+	nccl_ofi_tuner_model_costs(nccl_ofi_tuner_ctx);
+	*context = (void*)nccl_ofi_tuner_ctx;
 	pthread_mutex_unlock(&nccl_ofi_tuner_ctx_lock);
 
 	NCCL_OFI_TRACE(NCCL_TUNING, "Tuner init: comm with %ld ranks and %ld nodes.", nRanks, nNodes);
 	return ncclSuccess;
 }
 
-ncclResult_t nccl_ofi_tuner_get_coll_info(ncclFunc_t collType, size_t nBytes,
+ncclResult_t nccl_ofi_tuner_get_coll_info(void *context, ncclFunc_t collType, size_t nBytes,
 					  int collNetSupport, int nvlsSupport,
 					  int numPipeOps, int *algorithm, int *protocol,
 					  int* nChannels)
 {
 	float cost = 0;
 	float lowest = FLT_MAX;
 	int algo, proto = 0;
+	struct nccl_ofi_tuner_context *nccl_ofi_tuner_ctx = (struct nccl_ofi_tuner_context *)context;
 
 	/* Skip runs smaller than 2 nodes and fallback to NCCL's internal tunings */
-	if (nccl_ofi_tuner_ctx->num_nodes <= 2)
+	if (nccl_ofi_tuner_ctx->dims.num_nodes <= 2)
 		return ncclSuccess;
 
 	/*
@@ -100,7 +83,8 @@ ncclResult_t nccl_ofi_tuner_get_coll_info(ncclFunc_t collType, size_t nBytes,
 		if (algo == NCCL_ALGO_NVLS_TREE && proto != NCCL_PROTO_SIMPLE)
 			continue;
 
-		cost = nccl_ofi_tuner_compute_cost(collType, algo, proto, numPipeOps, nBytes);
+		cost = nccl_ofi_tuner_compute_cost(&nccl_ofi_tuner_ctx->model_params, &nccl_ofi_tuner_ctx->dims,
+						   collType, algo, proto, numPipeOps, nBytes);
 		if (cost < 0)
 			continue;
 
@@ -118,21 +102,76 @@ ncclResult_t nccl_ofi_tuner_get_coll_info(ncclFunc_t collType, size_t nBytes,
 	return ncclSuccess;
 }
 
-ncclResult_t nccl_ofi_tuner_destroy()
+ncclResult_t nccl_ofi_tuner_destroy(void *context)
 {
 	pthread_mutex_lock(&nccl_ofi_tuner_ctx_lock);
-	free(nccl_ofi_tuner_ctx);
-	/* Prevent other threads from freeing a dangling global ctx */
-	nccl_ofi_tuner_ctx = NULL;
+	if (context != NULL) {
+		free(context);
+	}
 	pthread_mutex_unlock(&nccl_ofi_tuner_ctx_lock);
 
 	return ncclSuccess;
 }
 
+const ncclTuner_v2_t ncclTunerPlugin_v2 = {
+	.name = "nccl_ofi_tuner",
+	.init = nccl_ofi_tuner_init,
+	.getCollInfo = nccl_ofi_tuner_get_coll_info,
+	.destroy = nccl_ofi_tuner_destroy
+};
+
+#if !defined(AWS_OFI_NCCL_MIN_TUNER_COMPAT) || (AWS_OFI_NCCL_MIN_TUNER_COMPAT <= 1)
+static struct nccl_ofi_tuner_context *nccl_ofi_tuner_ctx_internal;
+
+static ncclResult_t nccl_ofi_tuner_destroy_v1(void)
+{
+	void *context = NULL;
+
+	pthread_mutex_lock(&nccl_ofi_tuner_ctx_lock);
+	if (nccl_ofi_tuner_ctx_internal != NULL) {
+		/* Prevent other threads from freeing a dangling global ctx */
+		context = (void*)nccl_ofi_tuner_ctx_internal;
+		nccl_ofi_tuner_ctx_internal = NULL;
+	}
+	pthread_mutex_unlock(&nccl_ofi_tuner_ctx_lock);
+
+	return nccl_ofi_tuner_destroy(context);
+}
+
+static ncclResult_t nccl_ofi_tuner_init_v1(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction)
+{
+	/*
+	 * NCCL parses these variables and applies user filters inside its
+	 * current tuner logic. Ideally, this should be done regardless of the
+	 * use of NCCL's internal tuner or an external tuner plugin. For the
+	 * time being, given the external tuner is an opt-in, detect if a user
+	 * has set one of them and bail when an external tuner is loaded.
+	 */
+	if (getenv("NCCL_ALGO") || getenv("NCCL_PROTO")) {
+		NCCL_OFI_WARN("The tuner plugin can not be loaded when explicitly choosing an algorithm or protocol with NCCL_ALGO/NCCL_PROTO");
+		// FIXME: "ncclInvalidUsage should be returned when the error is
+		// most likely a user error" per nccl docs, which arguably makes
+		// it a better return code here than ncclInvalidArgument, but
+		// the former is currently not vended in ext-net headers, so
+		// we're returning ncclInvalidArgument instead.
+		return ncclInvalidArgument;
+	}
+	return nccl_ofi_tuner_init(nRanks, nNodes, logFunction, (void**)&nccl_ofi_tuner_ctx_internal);
+}
+
+static ncclResult_t nccl_ofi_tuner_get_coll_info_v1(ncclFunc_t collType, size_t nBytes, int collNetSupport,
+						    int nvlsSupport, int numPipeOps, int *algorithm, int *protocol,
+						    int *nChannels)
+{
+	return nccl_ofi_tuner_get_coll_info(nccl_ofi_tuner_ctx_internal, collType, nBytes,
+					    collNetSupport, nvlsSupport, numPipeOps, algorithm,
+					    protocol, nChannels);
+}
+
 const ncclTuner_v1_t ncclTunerPlugin_v1 = {
 	.name = "nccl_ofi_tuner",
-	.init = nccl_ofi_tuner_init,
-	.getCollInfo = nccl_ofi_tuner_get_coll_info,
-	.destroy = nccl_ofi_tuner_destroy
+	.init = nccl_ofi_tuner_init_v1,
+	.getCollInfo = nccl_ofi_tuner_get_coll_info_v1,
+	.destroy = nccl_ofi_tuner_destroy_v1
 };
+#endif /* !defined(AWS_OFI_NCCL_MIN_TUNER_COMPAT) || (AWS_OFI_NCCL_MIN_TUNER_COMPAT <= 1) */
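
Note on the compat-shim pattern above: the v2 entry points thread tuner state
through an opaque void * handle owned by the caller, while the v1 entry points
keep a single file-scope pointer and forward into the v2 implementations. Below
is a minimal, self-contained sketch of that pattern; every name in it
(tuner_ctx, tuner_init_v2, and so on) is an illustrative stand-in rather than
one of the plugin's or NCCL's real symbols, and the serialization that
nccl_ofi_tuner_ctx_lock provides in the real code is omitted for brevity.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the per-communicator tuner state (illustrative only). */
struct tuner_ctx {
	int num_ranks;
	int num_nodes;
};

/* v2-style interface: the caller owns an opaque context handle. */
static int tuner_init_v2(int nranks, int nnodes, void **context)
{
	struct tuner_ctx *ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -1;
	}
	ctx->num_ranks = nranks;
	ctx->num_nodes = nnodes;
	*context = ctx;
	return 0;
}

static int tuner_query_v2(void *context, int *ranks_per_node)
{
	struct tuner_ctx *ctx = (struct tuner_ctx *)context;
	*ranks_per_node = ctx->num_ranks / ctx->num_nodes;
	return 0;
}

static int tuner_destroy_v2(void *context)
{
	free(context);
	return 0;
}

/* v1 compat shims: the old interface carries no context parameter, so a
 * file-scope pointer stands in for it and each shim forwards to the v2
 * implementation, mirroring the nccl_ofi_tuner_*_v1() functions above. */
static struct tuner_ctx *tuner_ctx_internal;

static int tuner_init_v1(int nranks, int nnodes)
{
	/* init writes through the void ** out-parameter, so it alone takes
	 * the address of the file-scope pointer. */
	return tuner_init_v2(nranks, nnodes, (void **)&tuner_ctx_internal);
}

static int tuner_query_v1(int *ranks_per_node)
{
	/* Forward the stored pointer itself, not its address. */
	return tuner_query_v2(tuner_ctx_internal, ranks_per_node);
}

static int tuner_destroy_v1(void)
{
	void *ctx = tuner_ctx_internal;
	tuner_ctx_internal = NULL;	/* avoid a dangling global */
	return tuner_destroy_v2(ctx);
}

int main(void)
{
	int ranks_per_node = 0;

	if (tuner_init_v1(16, 2) != 0) {
		return 1;
	}
	tuner_query_v1(&ranks_per_node);
	printf("ranks per node: %d\n", ranks_per_node);
	return tuner_destroy_v1();
}

Only the init shim takes the address of the file-scope pointer, because init is
the one that writes through the void ** out-parameter; the query and destroy
shims pass the stored pointer itself, exactly as nccl_ofi_tuner_get_coll_info_v1()
and nccl_ofi_tuner_destroy_v1() do in the patch.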