Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ROCm as alternative to CUDA for plugin use #461

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,30 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
[AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
[want_cuda=no])
have_device_interface=neuron])
CHECK_PKG_CUDA([have_device_interface=cuda])

# Select CUDA if Neuron wasn't specified and --with-rocm was not used.
CHECK_PKG_CUDA(AS_IF([test "${have_device_interface}" = "no"],
AS_IF([test -z "$with_rocm"], [have_device_interface=cuda])))
# If neither CUDA nor Neuron is being used, select ROCm
CHECK_PKG_ROCM(AS_IF([test "${have_device_interface}" = "no"], [have_device_interface=rocm]))
AS_IF([test "${have_device_interface}" = "no"],
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm or Neuron runtime.])])

do_cuda=0
do_rocm=0
AS_IF([test -n "$with_rocm"],
[AS_IF([test "$have_device_interface" = "rocm"],
[enable_tests="no"
do_rocm=1
])],
[AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])])

AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])

AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])
AS_IF([test ${do_rocm} = 1],
AC_DEFINE_UNQUOTED( [__HIP_PLATFORM_AMD__], [ 1 ], [Select AMD/ROCm HIP APIs] ))

CHECK_PKG_HWLOC([],
[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
Expand Down
2 changes: 1 addition & 1 deletion include/nccl-headers/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef NCCL_HEADERS_ERROR_H
#define NCCL_HEADERS_ERROR_H

#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
#include "nccl-headers/nvidia/err.h"
#elif HAVE_NEURON
#include "nccl-headers/neuron/error.h"
Expand Down
2 changes: 1 addition & 1 deletion include/nccl-headers/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef NCCL_HEADERS_NET_H
#define NCCL_HEADERS_NET_H

#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
#include "nccl-headers/nvidia/net.h"
#elif HAVE_NEURON
#include "nccl-headers/neuron/net.h"
Expand Down
26 changes: 16 additions & 10 deletions include/nccl_ofi_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@ extern "C" {

#include <cuda.h>

int nccl_net_ofi_cuda_init(void);
/*
* Error checking is currently just success or failure.
*/
enum {
GPU_SUCCESS = 0,
GPU_ERROR = 999 /* Match CUDA_UNKNOWN_ERROR value */
};

int nccl_net_ofi_gpu_init(void);

/*
* @brief Gets the CUDA device associated with the buffer
Expand All @@ -27,18 +35,16 @@ int nccl_net_ofi_cuda_init(void);
*/
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);

extern CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion);

extern CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);

extern CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device);
extern CUresult (*nccl_net_ofi_cuDeviceGetCount)(int* count);
extern int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
extern int nccl_net_ofi_gpuCtxGetDevice(CUdevice *device);
extern int nccl_net_ofi_gpuDeviceGetCount(int* count);

#if CUDA_VERSION >= 11030
extern CUresult (*nccl_net_ofi_cuFlushGPUDirectRDMAWrites)(CUflushGPUDirectRDMAWritesTarget target,
CUflushGPUDirectRDMAWritesScope scope);
extern int nccl_net_ofi_gpuFlushGPUDirectRDMAWrites();
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 1
#else
extern void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites;
extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0
#endif

#ifdef _cplusplus
Expand Down
49 changes: 49 additions & 0 deletions include/nccl_ofi_rocm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
 * Copyright (c) 2024 Hewlett Packard Enterprise Development LP
 * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
 */

/*
 * ROCm/HIP backend of the plugin's GPU abstraction.  Mirrors
 * nccl_ofi_cuda.h; exactly one of the two headers is compiled in.
 *
 * Fixed: include guard was NCCL_OFI_CUDA_H_ (copied from the CUDA
 * header, so including both would silently drop this one), and the
 * extern "C" guards tested `_cplusplus` (single underscore), which is
 * never defined -- C++ consumers would get C++ linkage.
 */
#ifndef NCCL_OFI_ROCM_H_
#define NCCL_OFI_ROCM_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <hip/hip_runtime_api.h>

/*
 * Error checking is currently just success or failure.
 */
enum {
	GPU_SUCCESS = 0,
	GPU_ERROR = 999 /* Match hipErrorUnknown */
};

/* Initialize the GPU (HIP) runtime hooks used by the plugin. */
int nccl_net_ofi_gpu_init(void);

/*
 * @brief Gets the GPU device associated with the buffer
 *
 * @param data
 *	Pointer to GPU buffer.
 *
 * @param dev_id
 *	On success, set to the ID of the device owning the buffer.
 *
 * @return 0 on success
 *	   non-zero on error
 */
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);

/* Thin wrappers over the HIP runtime; each returns GPU_SUCCESS or
 * GPU_ERROR. */
int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
int nccl_net_ofi_gpuCtxGetDevice(int *device);
int nccl_net_ofi_gpuDeviceGetCount(int* count);

/* Flushing GPUDirect RDMA writes is not supported on the ROCm path. */
extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0

#ifdef __cplusplus
} // End extern "C"
#endif

#endif // End NCCL_OFI_ROCM_H_
3 changes: 0 additions & 3 deletions m4/check_pkg_cuda.m4
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ AC_DEFUN([CHECK_PKG_CUDA], [
CPPFLAGS="${check_pkg_CPPFLAGS_save}"
$2])
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
AC_SUBST([CUDA_LDFLAGS])
AC_SUBST([CUDA_LIBS])
Expand Down
52 changes: 52 additions & 0 deletions m4/check_pkg_rocm.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- autoconf -*-
#
# Copyright (c) 2024 Hewlett Packard Enterprise Development LP
# Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# CHECK_PKG_ROCM([action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Look for a usable ROCm/HIP installation (libamdhip64 plus
# hip/hip_runtime_api.h), honoring --with-rocm=PATH for non-standard
# locations.  $1 is only executed when --with-rocm was given explicitly
# AND the installation checks passed; otherwise $2 is executed, so ROCm
# is never auto-selected.
AC_DEFUN([CHECK_PKG_ROCM], [
  check_pkg_found="yes"

  check_pkg_CPPFLAGS_save="${CPPFLAGS}"
  check_pkg_LDFLAGS_save="${LDFLAGS}"
  check_pkg_LIBS_save="${LIBS}"

  AC_ARG_WITH([rocm],
     [AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])

  dnl Fixed: the original tested "${with-rocm}" (shell default-value
  dnl expansion of the unset variable `with`, i.e. always "rocm") and
  dnl "{with_rocm}" (missing `$`), so neither branch ever looked at the
  dnl real with_rocm value.
  AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
        [],
        [test "${with_rocm}" = "no"],
        [check_pkg_found=no],
        [AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
         CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
         LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])

  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])

  dnl On failure, restore the flags we may have prepended above.
  AS_IF([test "${check_pkg_found}" = "yes"],
        [check_pkg_define="yes"],
        [check_pkg_define="no"
         CPPFLAGS="${check_pkg_CPPFLAGS_save}"
         LDFLAGS="${check_pkg_LDFLAGS_save}"
         LIBS="${check_pkg_LIBS_save}"
        ])

  AS_IF([test -n "${with_rocm}"],
        [AS_IF([test "${check_pkg_define}" = "yes"],
               [$1], [$2] )
        ], [$2]
  )

  AS_UNSET([check_pkg_found])
  AS_UNSET([check_pkg_define])
  AS_UNSET([check_pkg_CPPFLAGS_save])
  AS_UNSET([check_pkg_LDFLAGS_save])
  AS_UNSET([check_pkg_LIBS_save])
])
54 changes: 30 additions & 24 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -26,52 +26,58 @@ sources = \
nccl_ofi_ep_addr_list.c \
tracepoint.c

tuner_sources = \
tuner/nccl_ofi_regions.c \
tuner/nccl_ofi_tuner.c

if WANT_PLATFORM_AWS
sources += platform-aws.c
endif

if ENABLE_NEURON
sources += nccl_ofi_interface_neuron.c
else
sources += nccl_ofi_cuda.c \
nccl_ofi_interface_nvidia.c
endif

# Build an internal-only library that can be used by unit tests as
# well as the actual nccl_net.so / nccom_net.so libraries. This saves
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_net_plugin.la
libinternal_net_plugin_la_SOURCES = $(sources)
libinternal_net_plugin_la_LDFLAGS = -avoid-version

if ENABLE_NEURON
lib_LTLIBRARIES = libnccom-net.la
libnccom_net_la_SOURCES =
libnccom_net_la_LIBADD = libinternal_net_plugin.la
libnccom_net_la_LDFLAGS = -module -avoid-version
endif

if HAVE_CUDA
sources += nccl_ofi_cuda.c nccl_ofi_interface_nvidia.c
if WANT_PLATFORM_AWS
# NCCL tuner plugin
lib_LTLIBRARIES = libnccl-net.la libnccl-ofi-tuner.la
libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
else
lib_LTLIBRARIES = libnccl-net.la
endif

libnccl_net_la_SOURCES =
libnccl_net_la_LIBADD = libinternal_net_plugin.la
libnccl_net_la_LDFLAGS = -module -avoid-version
endif

if HAVE_ROCM
sources += nccl_ofi_rocm.c nccl_ofi_interface_nvidia.c

lib_LTLIBRARIES = librccl-net.la
librccl_net_la_SOURCES =
librccl_net_la_LIBADD = libinternal_net_plugin.la
librccl_net_la_LDFLAGS = -module -avoid-version
endif

# Build an internal-only library that can be used by unit tests as
# well as the actual nccl_net.so / nccom_net.so libraries. This saves
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_net_plugin.la
libinternal_net_plugin_la_SOURCES = $(sources)
libinternal_net_plugin_la_LDFLAGS = -avoid-version

#
# Tuner
#
noinst_LTLIBRARIES += libinternal_tuner_plugin.la
tuner_sources = \
tuner/nccl_ofi_regions.c \
tuner/nccl_ofi_tuner.c
libinternal_tuner_plugin_la_SOURCES = $(tuner_sources)
libinternal_tuner_plugin_la_LDFLAGS = -avoid-version

if HAVE_CUDA
if WANT_PLATFORM_AWS
# NCCL tuner plugin
lib_LTLIBRARIES += libnccl-ofi-tuner.la
libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
endif
endif
2 changes: 1 addition & 1 deletion src/nccl_ofi_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ ncclResult_t nccl_net_ofi_regMr(void *comm, void *data, size_t size, int type,
/* Validate type of buffer */
bool valid_buffer_type = false;
if (type == NCCL_PTR_HOST) valid_buffer_type = true;
#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
#endif
#if HAVE_NEURON
Expand Down
36 changes: 28 additions & 8 deletions src/nccl_ofi_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,37 @@
#include "nccl_ofi_cuda.h"
#include "nccl_ofi_log.h"

CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion) = NULL;
CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr) = NULL;
CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device) = NULL;
CUresult (*nccl_net_ofi_cuDeviceGetCount)(int *count) = NULL;
/* CUDA driver entry points, resolved via dlopen/dlsym in
 * nccl_net_ofi_gpu_init(); they remain NULL until that init succeeds,
 * so the wrappers below must not be called before a successful init. */
static CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion) = NULL;
static CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr) = NULL;
static CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device) = NULL;
static CUresult (*nccl_net_ofi_cuDeviceGetCount)(int *count) = NULL;
#if CUDA_VERSION >= 11030
static CUresult (*nccl_net_ofi_cuFlushGPUDirectRDMAWrites)(CUflushGPUDirectRDMAWritesTarget target,
							   CUflushGPUDirectRDMAWritesScope scope) = NULL;
#else
void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites = NULL;
#endif

/* Generic GPU wrappers: collapse the CUresult error space to
 * GPU_SUCCESS / GPU_ERROR so callers stay CUDA/ROCm agnostic. */

int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion) {
	return nccl_net_ofi_cuDriverGetVersion(driverVersion) == CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}

/* NOTE(review): the header declares this parameter as CUdevice*; CUdevice
 * is a typedef of int so the cast is benign, but the prototypes should be
 * kept in sync to avoid a conflicting-declaration error. */
int nccl_net_ofi_gpuCtxGetDevice(int *device) {
	return nccl_net_ofi_cuCtxGetDevice((CUdevice *)device) == CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}

int nccl_net_ofi_gpuDeviceGetCount(int *count) {
	return nccl_net_ofi_cuDeviceGetCount(count) == CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}

#if CUDA_VERSION >= 11030
/* Flush outstanding GPUDirect RDMA writes for the current context.
 * Parameter list fixed from `()` to `(void)`: in C, `()` declares an
 * unspecified (old-style) parameter list, not "no arguments". */
int nccl_net_ofi_gpuFlushGPUDirectRDMAWrites(void) {
	return nccl_net_ofi_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
						       CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER) ==
		CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}
#endif

#define STRINGIFY(sym) # sym

#define LOAD_SYM(sym) \
Expand All @@ -34,7 +54,7 @@ void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites = NULL;
}

int
nccl_net_ofi_cuda_init(void)
nccl_net_ofi_gpu_init(void)
{
int ret = 0;
void *cudadriver_lib = NULL;
Expand Down Expand Up @@ -77,7 +97,7 @@ int nccl_net_ofi_get_cuda_device(void *data, int *dev_id)
CUresult cuda_ret_mem = nccl_net_ofi_cuPointerGetAttribute(&mem_type,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr) data);
CUresult cuda_ret_dev = nccl_net_ofi_cuPointerGetAttribute(&device_ordinal,
CUresult cuda_ret_dev = nccl_net_ofi_cuPointerGetAttribute(&device_ordinal,
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
(CUdeviceptr) data);

Expand Down
Loading
Loading