ICLDisco · bosilca · Aug 5, 2024 · Aug 1, 2024 · Aug 1, 2024 · Aug 8, 2024
@@ -170,6 +170,9 @@ option(PARSEC_DIST_WITH_MPI
 if(PARSEC_DIST_WITH_MPI AND 0)
   message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one")
 endif()
+option(PARSEC_MPI_IS_GPU_AWARE
+       "Build PaRSEC assuming the MPI library is GPU-aware, aka. can move data directly to and from GPU memory.\
+       As of today (mid 2024) while most MPI support such an option, they require a single process per GPU" ON)
 option(PARSEC_DIST_THREAD
   "Use an extra thread to progress the data movements" ON)
 option(PARSEC_DIST_PRIORITIES

@@ -103,7 +103,8 @@ cat <<EOF
                 use the MPI communication library [installed in DIR] (default=autodetect)
 --enable-collectives
                 use asynchronous dataflow collective communication
-
+--enable-mpi-gpu-aware
+                assume the MPI communication library can send/receive from GPU data buffers directly
 
 --with-cuda[=DIR]
                 use the CUDA accelerator libray [installed in DIR] (default=autodetect)
@@ -236,6 +237,8 @@ while [ "x$1" != x ]; do
         --without-mpi) with_mpi=no; shift;;
         --enable-collectives) enable_collectives=yes; shift;;
         --disable-collectives) enable_collectives=no; shift;;
+        --enable-mpi-gpu-aware) enable_mpi_gpu_aware=yes; shift;;
+        --disable-mpi-gpu-aware) enable_mpi_gpu_aware=no; shift;;
 
 # Hwloc options
         --with-hwloc=*) with_hwloc="${1#*=}"; shift;;
@@ -523,7 +526,8 @@ x) ;;
 esac
 [ x$enable_collectives = xyes ] && CMAKE_DEFINES+=" -DPARSEC_DIST_COLLECTIVES=ON"
 [ x$enable_collectives = xno ] && CMAKE_DEFINES+=" -DPARSEC_DIST_COLLECTIVES=OFF"
-
+[ x$enable_mpi_gpu_aware = xyes ] && CMAKE_DEFINES+=" -DPARSEC_MPI_IS_GPU_AWARE=ON"
+[ x$enable_mpi_gpu_aware = xno ] && CMAKE_DEFINES+=" -DPARSEC_MPI_IS_GPU_AWARE=OFF"
 
 case x$with_cuda in
 xno) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_CUDA=OFF";;

@@ -238,6 +238,7 @@ if( BUILD_PARSEC )
     $<$<BOOL:${PARSEC_HAVE_OTF2}>:OTF2::OTF2>
     $<$<BOOL:${MPI_C_FOUND}>:MPI::MPI_C>
     $<$<BOOL:${PARSEC_HAVE_CUDA}>:CUDA::cudart>
+    $<$<BOOL:${PARSEC_HAVE_CUDA}>:cuda>
     $<$<BOOL:${PARSEC_HAVE_HIP}>:hip::host>
     ${EXTRA_LIBS}
     INTERFACE

@@ -235,43 +235,118 @@ int  parsec_arena_allocate_device_private(parsec_data_copy_t *copy,
     return PARSEC_SUCCESS;
 }
 
-parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
-                                          size_t count, int device,
-                                          parsec_datatype_t dtt)
+#include "parsec/utils/zone_malloc.h"
+#include "mca/device/device_gpu.h"
+
+#if defined(PARSEC_DEBUG)
+static int64_t parsec_countable_incoming_message = 0xF000000000000000;
+#endif  /* defined(PARSEC_DEBUG) */
+
+static inline parsec_data_copy_t *
+parsec_arena_internal_copy_new(parsec_arena_t *arena,
+                               parsec_data_t *data,
+                               size_t count, int device,
+                               parsec_datatype_t dtt)
 {
-    parsec_data_t *data;
-    parsec_data_copy_t *copy;
-    int rc;
-
-
-    data = parsec_data_new();
+    parsec_data_copy_t *copy = NULL;
+    parsec_data_t* ldata = data;
     if( NULL == data ) {
+        ldata = parsec_data_new();
+        if( NULL == ldata ) {
+            return NULL;
+        }
+#if defined(PARSEC_DEBUG)
+        /* Name the data with a default key to facilitate debuging */
+        ldata->key = (uint64_t)parsec_atomic_fetch_inc_int64(&parsec_countable_incoming_message);
+        ldata->key |= ((uint64_t)device) << 56;
+#endif  /* defined(PARSEC_DEBUG) */
+    }
+    if( 0 == device ) {
+        copy = parsec_data_copy_new(ldata, device, dtt,
+                                    PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA);
+        if (NULL == copy) {
+            goto free_and_return;
+        }
+        int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
+        if (PARSEC_SUCCESS != rc) {
+            goto free_and_return;
+        }
+        return copy;
+    }
+    /**
+     * This part is not really nice, it breaks the separation between devices, and how their memory is
+     * managed. But, it should give nice perfromance improvements if the communication layer is
+     * capable of sending or receiving data directly to and from the accelerator memory. The only drawback
+     * is that once the GPU memory is full, this will fail, so the soeftware will fall back to the
+     * prior behavior, going through the CPU memory.
+     *
+     * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies
+     * are released from the different LRU lists.
+     */
+    parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device);
+    if (NULL == gpu_device) {
         return NULL;
     }
+    size_t size = count * arena->elem_size;
+    void* device_private = zone_malloc(gpu_device->memory, size);
+    if( NULL == device_private ) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n",
+                             device, size, (void *)copy->arena_chunk);
+        goto free_and_return;
+    }
+    copy = parsec_data_copy_new(ldata, device, dtt,
+                                PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED);
+    if (NULL == copy) {
+        PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n",
+                             device, size, (void *)copy->arena_chunk);
+        zone_free(gpu_device->memory, device_private);
+        goto free_and_return;
+    }
+    copy->dtt = dtt;
+    copy->device_private = device_private;
+    copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory;
+    PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, "
+                                                  "data ptr %p",
+                         device, size, (void*)copy->arena_chunk, (void*)copy->device_private);
+    copy->version = 0;
+    copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
+    copy->original->owner_device = device;
+    copy->original->preferred_device = device;
+    return copy;
+  free_and_return:
+    if( NULL != copy )
+        PARSEC_OBJ_RELEASE(copy);
+    if( NULL == data)
+        PARSEC_OBJ_RELEASE(ldata); /* release the locally allocated data */
+    return NULL;
+}
 
-    copy = parsec_data_copy_new( data, device, dtt,
-                                 PARSEC_DATA_FLAG_ARENA |
-                                 PARSEC_DATA_FLAG_PARSEC_OWNED |
-                                 PARSEC_DATA_FLAG_PARSEC_MANAGED);
+parsec_data_copy_t *
+parsec_arena_get_new_copy(parsec_arena_t *arena,
+                          size_t count, int device,
+                          parsec_datatype_t dtt)
+{
+    parsec_data_copy_t *dev0_copy, *copy;
 
-    if(NULL == copy) {
-        PARSEC_OBJ_RELEASE(data);
+    dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt);
+    if( NULL == dev0_copy ) {
         return NULL;
     }
+    dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID;
+    dev0_copy->version = 0;  /* start from somewhere */
+    if( 0 == device ) {
+        return dev0_copy;
+    }
 
-    rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt);
-
+    copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt);
+    if( NULL == copy ) {
+        copy = dev0_copy;  /* return the main memory data copy */
+    }
     /* This data is going to be released once all copies are released
      * It does not exist without at least a copy, and we don't give the
      * pointer to the user, so we must remove our retain from it
      */
-    PARSEC_OBJ_RELEASE(data);
-
-    if( PARSEC_SUCCESS != rc ) {
-        PARSEC_OBJ_RELEASE(copy);
-        return NULL;
-    }
-
+    PARSEC_OBJ_RELEASE(dev0_copy->original);
     return copy;
 }
 

@@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena,
  *   enough resource to allocate a new data copy of this type.
  */
 
-parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena,
-                                          size_t count, int device,
-                                          parsec_datatype_t dtt);
+parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena,
+                                              size_t count, int device,
+                                              parsec_datatype_t dtt);
 
 /**
  * @brief Allocates memory for a given data copy. This is a function used by
  *  DSLs to set the memory associated with a data copy they have created.
- *  It is also used by parsec_arena_get_copy.
- * 
+ *  It is also used by parsec_arena_get_new_copy.
+ *
  * @param copy the (empty) data copy to allocate memory for. NB: the @p original
  *  field of this data copy must be set. The operation overwrites the device
  *  dtt and count of this data copy, as well as the device_private pointer.

@@ -313,6 +313,8 @@ void *parsec_info_get(parsec_info_object_array_t *oa, parsec_info_id_t iid)
     if(NULL == ie->constructor)
         return ret;
     nio = ie->constructor(oa->cons_obj, ie->cons_data);
+    if( NULL == nio )
+        return ret;
     ret = parsec_info_test_and_set(oa, iid, nio, NULL);
     if(ret != nio && NULL != ie->destructor) {
         ie->destructor(nio, ie->des_data);