Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ROCm as alternative to CUDA for plugin use #461

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 23 additions & 3 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,30 @@ CHECK_PKG_NEURON([AS_IF([test -n "${want_cuda}"],
[AC_MSG_ERROR([Cannot enable both CUDA and neuron.])],
[want_cuda=no])
have_device_interface=neuron])
CHECK_PKG_CUDA([have_device_interface=cuda])

# Select CUDA if Neuron wasn't specified and --with-rocm was not used.
CHECK_PKG_CUDA(AS_IF([test "${have_device_interface}" = "no"],
AS_IF([test -z "$with_rocm"], [have_device_interface=cuda])))
# If neither CUDA nor Neuron is being used, select ROCm
CHECK_PKG_ROCM(AS_IF([test "${have_device_interface}" = "no"], [have_device_interface=rocm]))
AS_IF([test "${have_device_interface}" = "no"],
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA or Neuron runtime.])])
[AC_MSG_ERROR([NCCL OFI Plugin requires either CUDA, ROCm or Neuron runtime.])])

do_cuda=0
do_rocm=0
AS_IF([test -n "$with_rocm"],
[AS_IF([test "$have_device_interface" = "rocm"],
[enable_tests="no"
do_rocm=1
])],
[AS_IF([test "$have_device_interface" = "cuda"], [do_cuda=1])])

AC_DEFINE_UNQUOTED([HAVE_CUDA], [${do_cuda}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test ${do_cuda} = 1])

AC_DEFINE_UNQUOTED([HAVE_ROCM], [${do_rocm}], [Defined to 1 if ROCm is available])
AM_CONDITIONAL([HAVE_ROCM], [test ${do_rocm} = 1])
AS_IF([test ${do_rocm} = 1],
AC_DEFINE_UNQUOTED( [__HIP_PLATFORM_AMD__], [ 1 ], [Select AMD/ROCm HIP APIs] ))

CHECK_PKG_HWLOC([],
[AC_MSG_ERROR([Could not find the hwloc library. Use --with-hwloc to provide the path to non-standard hwloc installation.])])
Expand Down
2 changes: 1 addition & 1 deletion include/nccl-headers/error.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef NCCL_HEADERS_ERROR_H
#define NCCL_HEADERS_ERROR_H

#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
#include "nccl-headers/nvidia/err.h"
#elif HAVE_NEURON
#include "nccl-headers/neuron/error.h"
Expand Down
2 changes: 1 addition & 1 deletion include/nccl-headers/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef NCCL_HEADERS_NET_H
#define NCCL_HEADERS_NET_H

#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
#include "nccl-headers/nvidia/net.h"
#elif HAVE_NEURON
#include "nccl-headers/neuron/net.h"
Expand Down
26 changes: 16 additions & 10 deletions include/nccl_ofi_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,15 @@ extern "C" {

#include <cuda.h>

int nccl_net_ofi_cuda_init(void);
/*
* Error checking is currently just success or failure.
*/
enum {
GPU_SUCCESS = 0,
GPU_ERROR = 999 /* Match CUDA_UNKNOWN_ERROR value */
};

int nccl_net_ofi_gpu_init(void);

/*
* @brief Gets the CUDA device associated with the buffer
Expand All @@ -27,18 +35,16 @@ int nccl_net_ofi_cuda_init(void);
*/
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);

extern CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion);

extern CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);

extern CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device);
extern CUresult (*nccl_net_ofi_cuDeviceGetCount)(int* count);
extern int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
extern int nccl_net_ofi_gpuCtxGetDevice(CUdevice *device);
extern int nccl_net_ofi_gpuDeviceGetCount(int* count);

#if CUDA_VERSION >= 11030
extern CUresult (*nccl_net_ofi_cuFlushGPUDirectRDMAWrites)(CUflushGPUDirectRDMAWritesTarget target,
CUflushGPUDirectRDMAWritesScope scope);
extern int nccl_net_ofi_gpuFlushGPUDirectRDMAWrites();
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 1
#else
extern void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites;
extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0
#endif

#ifdef _cplusplus
Expand Down
49 changes: 49 additions & 0 deletions include/nccl_ofi_rocm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
 * Copyright (c) 2024 Hewlett Packard Enterprise Development LP
 * Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All rights reserved.
 * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
 */

/*
 * ROCm/HIP backend of the plugin's GPU abstraction.  Mirrors
 * nccl_ofi_cuda.h; exactly one of the two headers is compiled in.
 *
 * Fixed: include guard was NCCL_OFI_CUDA_H_ (copied from the CUDA
 * header, so including both would silently drop this one), and the
 * extern "C" guards tested `_cplusplus` (single underscore), which is
 * never defined -- C++ consumers would get C++ linkage.
 */
#ifndef NCCL_OFI_ROCM_H_
#define NCCL_OFI_ROCM_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <hip/hip_runtime_api.h>

/*
 * Error checking is currently just success or failure.
 */
enum {
	GPU_SUCCESS = 0,
	GPU_ERROR = 999 /* Match hipErrorUnknown */
};

/* Initialize the GPU (HIP) runtime hooks used by the plugin. */
int nccl_net_ofi_gpu_init(void);

/*
 * @brief Gets the GPU device associated with the buffer
 *
 * @param data
 *	Pointer to GPU buffer.
 *
 * @param dev_id
 *	On success, set to the ID of the device owning the buffer.
 *
 * @return 0 on success
 *	   non-zero on error
 */
int nccl_net_ofi_get_cuda_device(void *data, int *dev_id);

/* Thin wrappers over the HIP runtime; each returns GPU_SUCCESS or
 * GPU_ERROR. */
int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion);
int nccl_net_ofi_gpuCtxGetDevice(int *device);
int nccl_net_ofi_gpuDeviceGetCount(int* count);

/* Flushing GPUDirect RDMA writes is not supported on the ROCm path. */
extern void *nccl_net_ofi_gpuFlushGPUDirectRDMAWrites;
#define HAVE_FLUSH_GPU_DIRECT_RDMA_WRITE 0

#ifdef __cplusplus
} // End extern "C"
#endif

#endif // End NCCL_OFI_ROCM_H_
3 changes: 0 additions & 3 deletions m4/check_pkg_cuda.m4
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ AC_DEFUN([CHECK_PKG_CUDA], [
CPPFLAGS="${check_pkg_CPPFLAGS_save}"
$2])
AC_DEFINE_UNQUOTED([HAVE_CUDA], [${check_pkg_define}], [Defined to 1 if CUDA is available])
AM_CONDITIONAL([HAVE_CUDA], [test "${check_pkg_found}" = "yes"])
AC_SUBST([CUDA_LDFLAGS])
AC_SUBST([CUDA_LIBS])
Expand Down
52 changes: 52 additions & 0 deletions m4/check_pkg_rocm.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- autoconf -*-
#
# Copyright (c) 2024 Hewlett Packard Enterprise Development LP
# Copyright (c) 2023 Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# CHECK_PKG_ROCM([action-if-found], [action-if-not-found])
# --------------------------------------------------------
# Look for a usable ROCm/HIP installation (libamdhip64 plus
# hip/hip_runtime_api.h), honoring --with-rocm=PATH for non-standard
# locations.  $1 is only executed when --with-rocm was given explicitly
# AND the installation checks passed; otherwise $2 is executed, so ROCm
# is never auto-selected.
AC_DEFUN([CHECK_PKG_ROCM], [
  check_pkg_found="yes"

  check_pkg_CPPFLAGS_save="${CPPFLAGS}"
  check_pkg_LDFLAGS_save="${LDFLAGS}"
  check_pkg_LIBS_save="${LIBS}"

  AC_ARG_WITH([rocm],
     [AS_HELP_STRING([--with-rocm=PATH], [Path to non-standard ROCm installation])])

  dnl Fixed: the original tested "${with-rocm}" (shell default-value
  dnl expansion of the unset variable `with`, i.e. always "rocm") and
  dnl "{with_rocm}" (missing `$`), so neither branch ever looked at the
  dnl real with_rocm value.
  AS_IF([test -z "${with_rocm}" -o "${with_rocm}" = "yes"],
        [],
        [test "${with_rocm}" = "no"],
        [check_pkg_found=no],
        [AS_IF([test -d ${with_rocm}/lib64], [check_pkg_libdir="lib64"], [check_pkg_libdir="lib"])
         CPPFLAGS="-I${with_rocm}/include ${CPPFLAGS}"
         LDFLAGS="-L${with_rocm}/${check_pkg_libdir} ${LDFLAGS}"])

  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_LIB([amdhip64], [hipMemAllocHost], [], [check_pkg_found=no])])
  AS_IF([test "${check_pkg_found}" = "yes"],
        [AC_CHECK_HEADERS([hip/hip_runtime_api.h], [], [check_pkg_found=no], [#define __HIP_PLATFORM_AMD__])])

  dnl On failure, restore the flags we may have prepended above.
  AS_IF([test "${check_pkg_found}" = "yes"],
        [check_pkg_define="yes"],
        [check_pkg_define="no"
         CPPFLAGS="${check_pkg_CPPFLAGS_save}"
         LDFLAGS="${check_pkg_LDFLAGS_save}"
         LIBS="${check_pkg_LIBS_save}"
        ])

  AS_IF([test -n "${with_rocm}"],
        [AS_IF([test "${check_pkg_define}" = "yes"],
               [$1], [$2] )
        ], [$2]
  )

  AS_UNSET([check_pkg_found])
  AS_UNSET([check_pkg_define])
  AS_UNSET([check_pkg_CPPFLAGS_save])
  AS_UNSET([check_pkg_LDFLAGS_save])
  AS_UNSET([check_pkg_LIBS_save])
])
54 changes: 30 additions & 24 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -26,52 +26,58 @@ sources = \
nccl_ofi_ep_addr_list.c \
tracepoint.c

tuner_sources = \
tuner/nccl_ofi_regions.c \
tuner/nccl_ofi_tuner.c

if WANT_PLATFORM_AWS
sources += platform-aws.c
endif

if ENABLE_NEURON
sources += nccl_ofi_interface_neuron.c
else
sources += nccl_ofi_cuda.c \
nccl_ofi_interface_nvidia.c
endif

# Build an internal-only library that can be used by unit tests as
# well as the actual nccl_net.so / nccom_net.so libraries. This saves
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_net_plugin.la
libinternal_net_plugin_la_SOURCES = $(sources)
libinternal_net_plugin_la_LDFLAGS = -avoid-version

if ENABLE_NEURON
lib_LTLIBRARIES = libnccom-net.la
libnccom_net_la_SOURCES =
libnccom_net_la_LIBADD = libinternal_net_plugin.la
libnccom_net_la_LDFLAGS = -module -avoid-version
endif

if HAVE_CUDA
sources += nccl_ofi_cuda.c nccl_ofi_interface_nvidia.c
if WANT_PLATFORM_AWS
# NCCL tuner plugin
lib_LTLIBRARIES = libnccl-net.la libnccl-ofi-tuner.la
libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
else
lib_LTLIBRARIES = libnccl-net.la
endif

libnccl_net_la_SOURCES =
libnccl_net_la_LIBADD = libinternal_net_plugin.la
libnccl_net_la_LDFLAGS = -module -avoid-version
endif

if HAVE_ROCM
sources += nccl_ofi_rocm.c nccl_ofi_interface_nvidia.c

lib_LTLIBRARIES = librccl-net.la
librccl_net_la_SOURCES =
librccl_net_la_LIBADD = libinternal_net_plugin.la
librccl_net_la_LDFLAGS = -module -avoid-version
endif

# Build an internal-only library that can be used by unit tests as
# well as the actual nccl_net.so / nccom_net.so libraries. This saves
# us writing dlopen() handlers for simple unit tests.
noinst_LTLIBRARIES = libinternal_net_plugin.la
libinternal_net_plugin_la_SOURCES = $(sources)
libinternal_net_plugin_la_LDFLAGS = -avoid-version

#
# Tuner
#
noinst_LTLIBRARIES += libinternal_tuner_plugin.la
tuner_sources = \
tuner/nccl_ofi_regions.c \
tuner/nccl_ofi_tuner.c
libinternal_tuner_plugin_la_SOURCES = $(tuner_sources)
libinternal_tuner_plugin_la_LDFLAGS = -avoid-version

if HAVE_CUDA
if WANT_PLATFORM_AWS
# NCCL tuner plugin
lib_LTLIBRARIES += libnccl-ofi-tuner.la
libnccl_ofi_tuner_la_SOURCES = $(tuner_sources)
libnccl_ofi_tuner_la_LDFLAGS = -module -avoid-version
endif
endif
2 changes: 1 addition & 1 deletion src/nccl_ofi_api.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ ncclResult_t nccl_net_ofi_regMr(void *comm, void *data, size_t size, int type,
/* Validate type of buffer */
bool valid_buffer_type = false;
if (type == NCCL_PTR_HOST) valid_buffer_type = true;
#if HAVE_CUDA
#if HAVE_CUDA || HAVE_ROCM
if (type == NCCL_PTR_CUDA) valid_buffer_type = true;
#endif
#if HAVE_NEURON
Expand Down
36 changes: 28 additions & 8 deletions src/nccl_ofi_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,37 @@
#include "nccl_ofi_cuda.h"
#include "nccl_ofi_log.h"

CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion) = NULL;
CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr) = NULL;
CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device) = NULL;
CUresult (*nccl_net_ofi_cuDeviceGetCount)(int *count) = NULL;
/* CUDA driver entry points, resolved via dlopen/dlsym in
 * nccl_net_ofi_gpu_init(); they remain NULL until that init succeeds,
 * so the wrappers below must not be called before a successful init. */
static CUresult (*nccl_net_ofi_cuDriverGetVersion)(int *driverVersion) = NULL;
static CUresult (*nccl_net_ofi_cuPointerGetAttribute)(void *data, CUpointer_attribute attribute, CUdeviceptr ptr) = NULL;
static CUresult (*nccl_net_ofi_cuCtxGetDevice)(CUdevice *device) = NULL;
static CUresult (*nccl_net_ofi_cuDeviceGetCount)(int *count) = NULL;
#if CUDA_VERSION >= 11030
static CUresult (*nccl_net_ofi_cuFlushGPUDirectRDMAWrites)(CUflushGPUDirectRDMAWritesTarget target,
							   CUflushGPUDirectRDMAWritesScope scope) = NULL;
#else
void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites = NULL;
#endif

/* Generic GPU wrappers: collapse the CUresult error space to
 * GPU_SUCCESS / GPU_ERROR so callers stay CUDA/ROCm agnostic. */

int nccl_net_ofi_gpuDriverGetVersion(int *driverVersion) {
	return nccl_net_ofi_cuDriverGetVersion(driverVersion) == CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}

/* NOTE(review): the header declares this parameter as CUdevice*; CUdevice
 * is a typedef of int so the cast is benign, but the prototypes should be
 * kept in sync to avoid a conflicting-declaration error. */
int nccl_net_ofi_gpuCtxGetDevice(int *device) {
	return nccl_net_ofi_cuCtxGetDevice((CUdevice *)device) == CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}

int nccl_net_ofi_gpuDeviceGetCount(int *count) {
	return nccl_net_ofi_cuDeviceGetCount(count) == CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}

#if CUDA_VERSION >= 11030
/* Flush outstanding GPUDirect RDMA writes for the current context.
 * Parameter list fixed from `()` to `(void)`: in C, `()` declares an
 * unspecified (old-style) parameter list, not "no arguments". */
int nccl_net_ofi_gpuFlushGPUDirectRDMAWrites(void) {
	return nccl_net_ofi_cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
						       CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER) ==
		CUDA_SUCCESS ? GPU_SUCCESS : GPU_ERROR;
}
#endif

#define STRINGIFY(sym) # sym

#define LOAD_SYM(sym) \
Expand All @@ -34,7 +54,7 @@ void *nccl_net_ofi_cuFlushGPUDirectRDMAWrites = NULL;
}

int
nccl_net_ofi_cuda_init(void)
nccl_net_ofi_gpu_init(void)
{
int ret = 0;
void *cudadriver_lib = NULL;
Expand Down Expand Up @@ -77,7 +97,7 @@ int nccl_net_ofi_get_cuda_device(void *data, int *dev_id)
CUresult cuda_ret_mem = nccl_net_ofi_cuPointerGetAttribute(&mem_type,
CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
(CUdeviceptr) data);
CUresult cuda_ret_dev = nccl_net_ofi_cuPointerGetAttribute(&device_ordinal,
CUresult cuda_ret_dev = nccl_net_ofi_cuPointerGetAttribute(&device_ordinal,
CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL,
(CUdeviceptr) data);

Expand Down
Loading
Loading