Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Topic/cuda aware communications #671

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ option(PARSEC_DIST_WITH_MPI
if(PARSEC_DIST_WITH_MPI AND 0)
message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one")
endif()
option(PARSEC_MPI_IS_GPU_AWARE
"Build PaRSEC assuming the MPI library is GPU-aware, aka. can move data directly to and from GPU memory.\
As of today (mid 2024) while most MPI support such an option, they require a single process per GPU" ON)
option(PARSEC_DIST_THREAD
"Use an extra thread to progress the data movements" ON)
option(PARSEC_DIST_PRIORITIES
Expand Down
8 changes: 6 additions & 2 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,8 @@ cat <<EOF
use the MPI communication library [installed in DIR] (default=autodetect)
--enable-collectives
use asynchronous dataflow collective communication

--enable-mpi-gpu-aware
assume the MPI communication library can send/receive from GPU data buffers directly

--with-cuda[=DIR]
use the CUDA accelerator libray [installed in DIR] (default=autodetect)
Expand Down Expand Up @@ -236,6 +237,8 @@ while [ "x$1" != x ]; do
--without-mpi) with_mpi=no; shift;;
--enable-collectives) enable_collectives=yes; shift;;
--disable-collectives) enable_collectives=no; shift;;
--enable-mpi-gpu-aware) enable_mpi_gpu_aware=yes; shift;;
--disable-mpi-gpu-aware) enable_mpi_gpu_aware=no; shift;;

# Hwloc options
--with-hwloc=*) with_hwloc="${1#*=}"; shift;;
Expand Down Expand Up @@ -523,7 +526,8 @@ x) ;;
esac
[ x$enable_collectives = xyes ] && CMAKE_DEFINES+=" -DPARSEC_DIST_COLLECTIVES=ON"
[ x$enable_collectives = xno ] && CMAKE_DEFINES+=" -DPARSEC_DIST_COLLECTIVES=OFF"

[ x$enable_mpi_gpu_aware = xyes ] && CMAKE_DEFINES+=" -DPARSEC_MPI_IS_GPU_AWARE=ON"
[ x$enable_mpi_gpu_aware = xno ] && CMAKE_DEFINES+=" -DPARSEC_MPI_IS_GPU_AWARE=OFF"

case x$with_cuda in
xno) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_CUDA=OFF";;
Expand Down
1 change: 1 addition & 0 deletions parsec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ if( BUILD_PARSEC )
$<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
$<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
$<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
$<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
${EXTRA_LIBS}
INTERFACE
Expand Down
123 changes: 99 additions & 24 deletions parsec/arena.c
Original file line number Diff line number Diff line change
Expand Up @@ -235,43 +235,118 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
return PARSEC_SUCCESS;
}

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt)
#include "parsec/utils/zone_malloc.h"
#include "mca/device/device_gpu.h"

#if defined(PARSEC_DEBUG)
/* Debug-only counter, atomically incremented to generate a default key for
 * every data that parsec_arena_internal_copy_new() creates on its own (i.e.
 * when the caller did not provide a data to attach the copy to).
 * NOTE(review): the initializer is larger than INT64_MAX, so its conversion
 * to int64_t is implementation-defined -- confirm this is acceptable on all
 * supported compilers. */
static int64_t parsec_countable_incoming_message = 0xF000000000000000;
#endif /* defined(PARSEC_DEBUG) */

/**
 * @brief Create a data copy on @p device, creating the parent data if needed.
 *
 * For device 0 the copy is created with the PARSEC_DATA_FLAG_ARENA flag and
 * its memory is obtained through parsec_arena_allocate_device_private(). For
 * any other device the memory is taken from the zone allocator of that device
 * (gpu_device->memory) and the copy is created without the ARENA flag.
 *
 * @param[in] arena  arena describing the element size (and used for the
 *                   device 0 allocation)
 * @param[in] data   data to attach the new copy to, or NULL to have a new
 *                   parsec_data_t created for this copy
 * @param[in] count  number of arena elements to allocate
 * @param[in] device index of the device the copy must reside on
 * @param[in] dtt    datatype associated with the copy
 *
 * @return the new data copy, or NULL if the device cannot be found or any
 *         allocation failed. On failure a locally created data is released;
 *         a caller-provided @p data is not released.
 */
static inline parsec_data_copy_t *
parsec_arena_internal_copy_new(parsec_arena_t *arena,
                               parsec_data_t *data,
                               size_t count, int device,
                               parsec_datatype_t dtt)
{
    parsec_data_copy_t *copy = NULL;
    parsec_data_t *ldata = data;

    if( NULL == data ) {
        ldata = parsec_data_new();
        if( NULL == ldata ) {
            return NULL;
        }
#if defined(PARSEC_DEBUG)
        /* Name the data with a default key to facilitate debugging */
        ldata->key = (uint64_t)parsec_atomic_fetch_inc_int64(&parsec_countable_incoming_message);
        ldata->key |= ((uint64_t)device) << 56;
#endif  /* defined(PARSEC_DEBUG) */
    }
    if( 0 == device ) {
        copy = parsec_data_copy_new(ldata, device, dtt,
                                    PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
        if( NULL == copy ) {
            goto free_and_return;
        }
        int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
        if( PARSEC_SUCCESS != rc ) {
            goto free_and_return;
        }
        return copy;
    }
    /**
     * This part is not really nice, it breaks the separation between devices, and how their memory is
     * managed. But, it should give nice performance improvements if the communication layer is
     * capable of sending or receiving data directly to and from the accelerator memory. The only drawback
     * is that once the GPU memory is full, this will fail, so the software will fall back to the
     * prior behavior, going through the CPU memory.
     *
     * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
     * are released from the different LRU lists.
     */
    parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
    if( NULL == gpu_device ) {
        /* Go through the common exit so that a locally created data is not leaked */
        goto free_and_return;
    }
    size_t size = count * arena->elem_size;
    void *device_private = zone_malloc(gpu_device->memory, size);
    if( NULL == device_private ) {
        /* No copy exists yet: report the zone through the device, never through the NULL copy */
        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
                             device, size, (void *)gpu_device->memory);
        goto free_and_return;
    }
    copy = parsec_data_copy_new(ldata, device, dtt,
                                PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
    if( NULL == copy ) {
        /* copy is NULL here, so the zone must be taken from the device itself */
        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
                             device, size, (void *)gpu_device->memory);
        zone_free(gpu_device->memory, device_private);
        goto free_and_return;
    }
    copy->dtt = dtt;
    copy->device_private = device_private;
    /* Remember which zone the memory was taken from */
    copy->arena_chunk = (parsec_arena_chunk_t *)gpu_device->memory;
    PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
                         "data ptr %p",
                         device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
    copy->version = 0;
    copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
    copy->original->owner_device = device;
    copy->original->preferred_device = device;
    return copy;

  free_and_return:
    if( NULL != copy ) {
        PARSEC_OBJ_RELEASE(copy);
    }
    if( NULL == data ) {
        PARSEC_OBJ_RELEASE(ldata);  /* release the locally allocated data */
    }
    return NULL;
}

copy = parsec_data_copy_new( data, device, dtt,
PARSEC_DATA_FLAG_ARENA |
PARSEC_DATA_FLAG_PARSEC_OWNED |
PARSEC_DATA_FLAG_PARSEC_MANAGED);
/**
 * @brief Create a new data together with a copy on device 0 and, when
 *        @p device is not 0, an additional copy on @p device.
 *
 * The device 0 copy is always created first (which also creates the data).
 * When @p device is not 0 a second copy, attached to the same data, is
 * requested on that device; if that request fails the device 0 copy is
 * returned instead.
 *
 * @param[in] arena  arena used to allocate the copies
 * @param[in] count  number of arena elements to allocate
 * @param[in] device index of the device the returned copy should reside on
 * @param[in] dtt    datatype associated with the copies
 *
 * @return the copy on @p device, the device 0 copy as a fallback, or NULL
 *         when the device 0 copy could not be created.
 */
parsec_data_copy_t *
parsec_arena_get_new_copy(parsec_arena_t *arena,
                          size_t count, int device,
                          parsec_datatype_t dtt)
{
    parsec_data_copy_t *dev0_copy, *copy;

    /* first allocate the copy on the device 0 */
    dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0, dtt);
    if( NULL == dev0_copy ) {
        return NULL;
    }
    dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
    dev0_copy->version = 0;  /* start from somewhere */
    if( 0 == device ) {
        /* NOTE(review): this path returns without dropping the retain on
         * dev0_copy->original that the non-zero device path releases below --
         * confirm this asymmetry is intended. */
        return dev0_copy;
    }

    copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
    if( NULL == copy ) {
        copy = dev0_copy;  /* return the main memory data copy */
    }
    /* This data is going to be released once all copies are released
     * It does not exist without at least a copy, and we don't give the
     * pointer to the user, so we must remove our retain from it
     */
    PARSEC_OBJ_RELEASE(dev0_copy->original);
    return copy;
}

Expand Down
10 changes: 5 additions & 5 deletions parsec/arena.h
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena,
* enough resource to allocate a new data copy of this type.
*/

parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);
parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena,
size_t count, int device,
parsec_datatype_t dtt);

/**
* @brief Allocates memory for a given data copy. This is a function used by
* DSLs to set the memory associated with a data copy they have created.
* It is also used by parsec_arena_get_copy.
*
* It is also used by parsec_arena_get_new_copy.
*
* @param copy the (empty) data copy to allocate memory for. NB: the @p original
* field of this data copy must be set. The operation overwrites the device
* dtt and count of this data copy, as well as the device_private pointer.
Expand Down
2 changes: 2 additions & 0 deletions parsec/class/info.c
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,8 @@ void *parsec_info_get(parsec_info_object_array_t *oa, parsec_info_id_t iid)
if(NULL == ie->constructor)
return ret;
nio = ie->constructor(oa->cons_obj, ie->cons_data);
if( NULL == nio )
return ret;
ret = parsec_info_test_and_set(oa, iid, nio, NULL);
if(ret != nio && NULL != ie->destructor) {
ie->destructor(nio, ie->des_data);
Expand Down
Loading
Loading