From c12fb4400cee8397d6917280109b55bf22042f4c Mon Sep 17 00:00:00 2001
From: Joseph John
Date: Wed, 7 Jun 2023 17:41:43 +1000
Subject: [PATCH] Implement GPU task progression with stream callbacks

The per-stream completion callback is implemented with cudaLaunchHostFunc():
once the work submitted for a task on a stream has completed, the callback
pushes the task to the pending queue of the next stream. This replaces the
CUDA event based task progression.
---
 parsec/mca/device/cuda/device_cuda_module.c | 555 +++++++++++---------
 parsec/mca/device/device_gpu.h              |  10 +
 2 files changed, 311 insertions(+), 254 deletions(-)

diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c
index c61132fa8..b0796d3fb 100644
--- a/parsec/mca/device/cuda/device_cuda_module.c
+++ b/parsec/mca/device/cuda/device_cuda_module.c
@@ -368,6 +368,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     gpu_device->data_avail_epoch = 0;
 
     gpu_device->max_exec_streams = parsec_cuda_max_streams;
+    gpu_device->last_exec_stream_index = 2;  /** execution streams start at index 2 */
     gpu_device->exec_stream = (parsec_gpu_exec_stream_t**)malloc(gpu_device->max_exec_streams * sizeof(parsec_gpu_exec_stream_t*));
     // To reduce the number of separate malloc, we allocate all the streams in a single block, stored in exec_stream[0]
@@ -399,6 +400,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
         exec_stream->start = 0;
         exec_stream->end = 0;
         exec_stream->name = NULL;
+        exec_stream->active_event_count = 0;
         exec_stream->fifo_pending = (parsec_list_t*)PARSEC_OBJ_NEW(parsec_list_t);
         PARSEC_OBJ_CONSTRUCT(exec_stream->fifo_pending, parsec_list_t);
         exec_stream->tasks = (parsec_gpu_task_t**)malloc(exec_stream->max_events
@@ -477,6 +479,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module )
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_lru, parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->gpu_mem_owned_lru, parsec_list_t);
     PARSEC_OBJ_CONSTRUCT(&gpu_device->pending, parsec_fifo_t);
+    PARSEC_OBJ_CONSTRUCT(&gpu_device->complete_queue, parsec_fifo_t);
     gpu_device->sort_starting_p = NULL;
     gpu_device->peer_access_mask = 0;  /* No GPU to GPU direct transfer by default */
 
@@ -569,6 +572,7 @@ parsec_cuda_module_fini(parsec_device_module_t* device)
     /* Release pending queue */
     PARSEC_OBJ_DESTRUCT(&gpu_device->pending);
+    PARSEC_OBJ_DESTRUCT(&gpu_device->complete_queue);
 
     /* Release all streams */
     for( j = 0; j < gpu_device->num_exec_streams; j++ ) {
@@ -1948,6 +1952,56 @@ parsec_gpu_callback_complete_push(parsec_device_gpu_module_t *gpu_device,
     return 0;
 }
 
+void stream_cb_fn(void *data)
+{
+    int rc = 0, last_exec_stream_index = 0;
+    stream_cb_data_t *stream_cb_data = (stream_cb_data_t *)data;
+
+#if defined(PARSEC_DEBUG_NOISIER)
+    char task_str[MAX_TASK_STRLEN];
+#endif
+
+    parsec_device_gpu_module_t* gpu_device = stream_cb_data->gpu_device;
+    parsec_gpu_task_t* gpu_task = stream_cb_data->gpu_task;
+    parsec_gpu_exec_stream_t* current_stream = stream_cb_data->current_stream;
+
+    rc = PARSEC_HOOK_RETURN_DONE;
+    if (gpu_task->complete_stage) {
+        rc = gpu_task->complete_stage(gpu_device, &gpu_task, current_stream);
+        // TODO: handle the case where complete_stage fails
+    }
+    assert(rc == PARSEC_HOOK_RETURN_DONE);
+
+    /** stage-in complete */
+    if( current_stream == gpu_device->exec_stream[0]) {
+        /** move the task to the next execution stream, chosen in a round-robin fashion */
+        last_exec_stream_index = 2 + ((gpu_device->last_exec_stream_index - 1) % (gpu_device->num_exec_streams - 2));
+        gpu_device->last_exec_stream_index =
last_exec_stream_index; + PARSEC_PUSH_TASK(gpu_device->exec_stream[last_exec_stream_index]->fifo_pending, (parsec_list_item_t*)gpu_task); + } + /** stage-out complete */ + else if( current_stream == gpu_device->exec_stream[1]) { + + /** move the task to complete queue */ + parsec_fifo_push( &(gpu_device->complete_queue), (parsec_list_item_t*)gpu_task ); + } + /** execution complete */ + else { + /** move the task to the stage-out queue */ + PARSEC_PUSH_TASK(gpu_device->exec_stream[1]->fifo_pending, (parsec_list_item_t*)gpu_task); + } + + PARSEC_DEBUG_VERBOSE(19, parsec_gpu_output_stream, + "GPU[%s]: Completed %s priority %d on stream %s{%p}", + gpu_device->super.name, + parsec_task_snprintf(task_str, MAX_TASK_STRLEN, gpu_task->ec), + gpu_task->ec->priority, current_stream->name, (void*)current_stream); + + /** update the number of pending active events on the stream */ + current_stream->active_event_count--; + free(data); +} + /** * This function tries to progress a stream, by picking up a ready task * and applying the progress function. The task to be progresses is @@ -1968,129 +2022,83 @@ static inline int progress_stream( parsec_device_gpu_module_t* gpu_device, parsec_gpu_exec_stream_t* stream, parsec_advance_task_function_t upstream_progress_fct, - parsec_gpu_task_t* task, parsec_gpu_task_t** out_task ) { parsec_advance_task_function_t progress_fct; + parsec_gpu_task_t* task = NULL; int saved_rc = 0, rc; #if defined(PARSEC_DEBUG_NOISIER) char task_str[MAX_TASK_STRLEN]; #endif parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)stream; - /* We always handle the tasks in order. Thus if we got a new task, add it to the - * local list (possibly by reordering the list). Also, as we can return a single - * task first try to see if anything completed. 
*/ - if( NULL != task ) { - PARSEC_PUSH_TASK(stream->fifo_pending, (parsec_list_item_t*)task); - task = NULL; - } + /** make sure out_task is populated only in case something fails */ *out_task = NULL; progress_fct = upstream_progress_fct; - if( NULL != stream->tasks[stream->end] ) { - rc = cudaEventQuery(cuda_stream->events[stream->end]); - if( cudaSuccess == rc ) { - /* Save the task for the next step */ - task = *out_task = stream->tasks[stream->end]; - PARSEC_DEBUG_VERBOSE(19, parsec_gpu_output_stream, - "GPU[%s]: Completed %s priority %d on stream %s{%p}", - gpu_device->super.name, - parsec_task_snprintf(task_str, MAX_TASK_STRLEN, task->ec), - task->ec->priority, stream->name, (void*)stream); - stream->tasks[stream->end] = NULL; - stream->end = (stream->end + 1) % stream->max_events; + while(stream->active_event_count < stream->max_events) + { + task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ -#if defined(PARSEC_PROF_TRACE) - if( stream->prof_event_track_enable ) { - if( task->prof_key_end != -1 ) { - PARSEC_PROFILING_TRACE(stream->profiling, task->prof_key_end, task->prof_event_id, task->prof_tp_id, NULL); - } - } -#endif /* (PARSEC_PROF_TRACE) */ - - rc = PARSEC_HOOK_RETURN_DONE; - if (task->complete_stage) - rc = task->complete_stage(gpu_device, out_task, stream); - /* the task can be withdrawn by the system */ - return rc; + if( NULL == task ) { /* No tasks, we're done */ + return saved_rc; } - if( cudaErrorNotReady != rc ) { - PARSEC_CUDA_CHECK_ERROR( "(progress_stream) cudaEventQuery ", rc, - {return PARSEC_HOOK_RETURN_AGAIN;} ); + + stream->active_event_count++; + PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)task); + + if ( NULL == upstream_progress_fct ) { + /* Grab the submit function */ + progress_fct = task->submit; + #if defined(PARSEC_DEBUG_PARANOID) + int i; + const parsec_flow_t *flow; + for( i = 0; i < task->ec->task_class->nb_flows; i++ ) { + /* Make sure data_in is not NULL */ + if( NULL == task->ec->data[i].data_in ) continue; + + flow = task->flow[i]; + if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; + if( 0 == (task->ec->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; + assert(task->ec->data[i].data_out->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER); + } + #endif /* defined(PARSEC_DEBUG_PARANOID) */ } - } - grab_a_task: - if( NULL == stream->tasks[stream->start] ) { /* there is room on the stream */ - task = (parsec_gpu_task_t*)parsec_list_pop_front(stream->fifo_pending); /* get the best task */ - } - if( NULL == task ) { /* No tasks, we're done */ - return saved_rc; - } - PARSEC_LIST_ITEM_SINGLETON((parsec_list_item_t*)task); + rc = progress_fct( gpu_device, task, stream ); + if( 0 > rc ) { + if( PARSEC_HOOK_RETURN_AGAIN != rc && + PARSEC_HOOK_RETURN_ASYNC != rc ) { + *out_task = task; + return rc; + } - assert( NULL == stream->tasks[stream->start] ); - /** - * In case the task is successfully progressed, the corresponding profiling - * event is triggered. 
-     */
-    if ( NULL == upstream_progress_fct ) {
-        /* Grab the submit function */
-        progress_fct = task->submit;
-#if defined(PARSEC_DEBUG_PARANOID)
-        int i;
-        const parsec_flow_t *flow;
-        for( i = 0; i < task->ec->task_class->nb_flows; i++ ) {
-            /* Make sure data_in is not NULL */
-            if( NULL == task->ec->data[i].data_in ) continue;
-
-            flow = task->flow[i];
-            if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue;
-            if( 0 == (task->ec->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue;
-            assert(task->ec->data[i].data_out->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER);
-        }
-#endif /* defined(PARSEC_DEBUG_PARANOID) */
-    }
-    rc = progress_fct( gpu_device, task, stream );
-    if( 0 > rc ) {
-        if( PARSEC_HOOK_RETURN_AGAIN != rc &&
-            PARSEC_HOOK_RETURN_ASYNC != rc ) {
-            *out_task = task;
-            return rc;
+            if( PARSEC_HOOK_RETURN_ASYNC == rc ) {
+                PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
+                                     "GPU[%s]: GPU task %p has been removed by the progress function",
+                                     gpu_device->super.name, (void*)task);
+            } else {
+                parsec_fifo_push(stream->fifo_pending, (parsec_list_item_t*)task);
+                PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
+                                     "GPU[%s]: Reschedule task %p: no room available on the GPU for data",
+                                     gpu_device->super.name, (void*)task->ec);
+            }
+            *out_task = NULL;
+            return PARSEC_HOOK_RETURN_DONE;
         }
-        if( PARSEC_HOOK_RETURN_ASYNC == rc ) {
-            PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
-                                 "GPU[%s]: GPU task %p has been removed by the progress function",
-                                 gpu_device->super.name, (void*)task);
-        } else {
-            parsec_fifo_push(stream->fifo_pending, (parsec_list_item_t*)task);
-            PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,
-                                 "GPU[%s]: Reschedule task %p: no room available on the GPU for data",
-                                 gpu_device->super.name, (void*)task->ec);
-        }
-        *out_task = NULL;
-        return PARSEC_HOOK_RETURN_DONE;
-    }
-    /**
-     * Do not skip the cuda event generation. The problem is that some of the inputs
-     * might be in the pipe of being transferred to the GPU. If we activate this task
-     * too early, it might get executed before the data is available on the GPU.
-     * Obviously, this lead to incorrect results.
-     */
-    rc = cudaEventRecord( cuda_stream->events[stream->start], cuda_stream->cuda_stream );
-    assert(cudaSuccess == rc);
-    stream->tasks[stream->start] = task;
-    stream->start = (stream->start + 1) % stream->max_events;
-    PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
-                         "GPU[%s]: Submitted %s(task %p) priority %d on stream %s{%p}",
-                         gpu_device->super.name,
-                         task->ec->task_class->name, (void*)task->ec, task->ec->priority,
-                         stream->name, (void*)stream);
+        /** create the callback data handed back to us when this task's operation completes */
+        stream_cb_data_t *stream_cb_data = malloc(sizeof(stream_cb_data_t));
+        stream_cb_data->gpu_device = gpu_device;
+        stream_cb_data->gpu_task = task;
+        stream_cb_data->current_stream = stream;
+        task->stream_cb_data = stream_cb_data;
+        /** stream_cb_fn() is invoked with stream_cb_data once the work enqueued
+         * by progress_fct() has completed on the stream */
+        rc = cudaLaunchHostFunc(cuda_stream->cuda_stream, stream_cb_fn, stream_cb_data);
+        assert(cudaSuccess == rc);
 
-    task = NULL;
-    goto grab_a_task;
+        task = NULL;
+    }
 }
 
 /**
@@ -2532,6 +2540,60 @@ parsec_cuda_kernel_cleanout( parsec_device_gpu_module_t *gpu_device,
     return 0;
 }
 
+/**
+ * Remove the task from the device count.
+ * Returns 0 when no tasks remain on the device, 1 otherwise.
+ */
+int remove_gpu_task(parsec_execution_stream_t *es, parsec_device_gpu_module_t *gpu_device,
+                    parsec_gpu_task_t *gpu_task)
+{
+    int rc = -1;
+
+    parsec_atomic_fetch_add_int64(&gpu_device->super.device_load, -gpu_task->load);
+    rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) );
+
+    if( 1 == rc ) {  /* I was the last one */
+
+  #if defined(PARSEC_PROF_TRACE)
+        if( parsec_gpu_trackable_events & PARSEC_PROFILE_GPU_TRACK_OWN )
+            PARSEC_PROFILING_TRACE( es->es_profile, parsec_gpu_own_GPU_key_end,
+                                    (unsigned long)es, PROFILE_OBJECT_ID_NULL, NULL );
+  #endif /* defined(PARSEC_PROF_TRACE) */
+
+        PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Leaving GPU management at %s:%d",
+                             gpu_device->super.name, __FILE__, __LINE__);
+
+        return 0;
+    }
+
+    return 1;
+
+}
+
+/** Push the task to the stage-out stream so that its data is moved back out of the GPU */
+int get_data_out_of_device(parsec_execution_stream_t *es, parsec_device_gpu_module_t *gpu_device,
+                           parsec_gpu_task_t *gpu_task)
+{
+
+#if defined(PARSEC_DEBUG_NOISIER)
+    char tmp[MAX_TASK_STRLEN];
+#endif
+
+    if( NULL != gpu_task) {
+
+        /** move the task to the stage-out queue */
+        PARSEC_PUSH_TASK(gpu_device->exec_stream[1]->fifo_pending, (parsec_list_item_t*)gpu_task);
+
+        PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tRetrieve data (if any) for %s priority %d", gpu_device->super.name,
+                             parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec),
+                             gpu_task->ec->priority );
+
+        return 1;
+    }
+    return 0;
+}
+
+
 /**
  * This version is based on 4 streams: one for transfers from the memory to
  * the GPU, 2 for kernel executions and one for transfers from the GPU into
@@ -2540,6 +2602,7 @@ parsec_cuda_kernel_cleanout( parsec_device_gpu_module_t *gpu_device,
  * been completed. Each type of stream (in, exec and out) has a pending FIFO,
  * where tasks ready to jump to the respective step are waiting.
*/ + parsec_hook_return_t parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, parsec_gpu_task_t *gpu_task, @@ -2549,7 +2612,7 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, parsec_device_cuda_module_t *cuda_device; cudaError_t status; int rc, exec_stream = 0; - parsec_gpu_task_t *progress_task, *out_task_submit = NULL, *out_task_pop = NULL; + parsec_gpu_task_t *error_task, *out_task_submit = NULL, *out_task_pop = NULL; #if defined(PARSEC_DEBUG_NOISIER) char tmp[MAX_TASK_STRLEN]; #endif @@ -2592,8 +2655,11 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, nanosleep(&delay, NULL); } } + + /** always push the task to the pending queue */ + parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); if( 0 < rc ) { - parsec_fifo_push( &(gpu_device->pending), (parsec_list_item_t*)gpu_task ); + /** return if you are not the manager */ return PARSEC_HOOK_RETURN_ASYNC; } PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Entering GPU management at %s:%d", @@ -2609,165 +2675,146 @@ parsec_cuda_kernel_scheduler( parsec_execution_stream_t *es, PARSEC_CUDA_CHECK_ERROR( "(parsec_cuda_kernel_scheduler) cudaSetDevice ", status, {return PARSEC_HOOK_RETURN_DISABLE;} ); - check_in_deps: - if( NULL != gpu_task ) { - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "GPU[%s]:\tUpload data (if any) for %s priority %d", - gpu_device->super.name, - parsec_gpu_describe_gpu_task(tmp, MAX_TASK_STRLEN, gpu_task), - gpu_task->ec->priority ); - } - rc = progress_stream( gpu_device, - gpu_device->exec_stream[0], - parsec_cuda_kernel_push, - gpu_task, &progress_task ); - if( rc < 0 ) { /* In case of error progress_task is the task that raised it */ - if( -1 == rc ) - goto disable_gpu; - /* We are in the early stages, and if there no room on the GPU for a task we need to - * delay all retries for the same task for a little while. Meanwhile, put the task back - * trigger a device flush, and keep executing tasks that have their data on the device. 
- */ - if( NULL != progress_task ) { - PARSEC_PUSH_TASK(gpu_device->exec_stream[0]->fifo_pending, (parsec_list_item_t*)progress_task); - progress_task = NULL; + do + { + parsec_gpu_task_t *progress_task = NULL; + + progress_task = (parsec_gpu_task_t*)parsec_fifo_try_pop( &(gpu_device->pending) ); + if( NULL != progress_task) { + if( PARSEC_GPU_TASK_TYPE_D2D_COMPLETE == progress_task->task_type ) { + get_data_out_of_device(es, gpu_device, progress_task); + } + else { + PARSEC_PUSH_TASK(gpu_device->exec_stream[0]->fifo_pending, (parsec_list_item_t*)progress_task); + } + + pop_null = 0; + progress_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; /* force at least one tour */ + + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tGet from shared queue %s priority %d", gpu_device->super.name, + parsec_gpu_describe_gpu_task(tmp, MAX_TASK_STRLEN, progress_task), progress_task->ec->priority); + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "GPU[%s]:\tUpload data (if any) for %s priority %d", + gpu_device->super.name, + parsec_gpu_describe_gpu_task(tmp, MAX_TASK_STRLEN, progress_task), + progress_task->ec->priority ); } - /* If we can extract data go for it, otherwise try to drain the pending tasks */ - gpu_task = parsec_gpu_create_w2r_task(gpu_device, es); - if( NULL != gpu_task ) - goto get_data_out_of_device; - } - gpu_task = progress_task; - - /* Stage-in completed for this task: it is ready to be executed */ - exec_stream = (exec_stream + 1) % (gpu_device->num_exec_streams - 2); /* Choose an exec_stream */ - if( NULL != gpu_task ) { - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tExecute %s priority %d", gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec), - gpu_task->ec->priority ); - } - rc = progress_stream( gpu_device, - gpu_device->exec_stream[2+exec_stream], - NULL, - gpu_task, &progress_task ); - if( rc < 0 ) { - if( PARSEC_HOOK_RETURN_DISABLE == rc ) - goto disable_gpu; - if( PARSEC_HOOK_RETURN_ASYNC != rc ) { - /* Reschedule the task. As the chore_id has been modified, - another incarnation of the task will be executed. */ - if( NULL != progress_task ) { - parsec_cuda_kernel_cleanout(gpu_device, progress_task); - __parsec_reschedule(es, progress_task->ec); - gpu_task = progress_task; - progress_task = NULL; - goto remove_gpu_task; + else { + pop_null++; + if( pop_null % 1024 == 1023 ) { + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]:\tStill waiting for %d tasks to execute, but poped NULL the last %d times I tried to pop something...", + gpu_device->super.name, gpu_device->mutex, pop_null); } - gpu_task = NULL; - goto fetch_task_from_shared_queue; } - progress_task = NULL; - } - gpu_task = progress_task; - out_task_submit = progress_task; - get_data_out_of_device: - if( NULL != gpu_task ) { /* This task has completed its execution */ - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tRetrieve data (if any) for %s priority %d", gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec), - gpu_task->ec->priority ); - } - /* Task is ready to move the data back to main memory */ - rc = progress_stream( gpu_device, - gpu_device->exec_stream[1], - parsec_cuda_kernel_pop, - gpu_task, &progress_task ); - if( rc < 0 ) { - if( -1 == rc ) - goto disable_gpu; - } - if( NULL != progress_task ) { - /* We have a successfully completed task. However, it is not gpu_task, as - * it was just submitted into the data retrieval system. 
Instead, the task - * ready to move into the next level is the progress_task. - */ - gpu_task = progress_task; - progress_task = NULL; - goto complete_task; - } - gpu_task = progress_task; - out_task_pop = progress_task; + /** Progress tasks in stage-in queue */ + rc = progress_stream( gpu_device, gpu_device->exec_stream[0], + parsec_cuda_kernel_push, &error_task ); + + if( rc < 0 ) { /* In case of error error_task is the task that raised it */ + if( -1 == rc ) { + /* Something wrong happened. Push all the pending tasks back on the + * cores, and disable the gpu. + */ + parsec_warning("Critical issue related to the GPU discovered. Giving up\n"); + return PARSEC_HOOK_RETURN_DISABLE; + } - fetch_task_from_shared_queue: - assert( NULL == gpu_task ); - if (1 == parsec_cuda_sort_pending && out_task_submit == NULL && out_task_pop == NULL) { - parsec_gpu_sort_pending_list(gpu_device); - } - gpu_task = (parsec_gpu_task_t*)parsec_fifo_try_pop( &(gpu_device->pending) ); - if( NULL != gpu_task ) { - pop_null = 0; - gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch - 1; /* force at least one tour */ - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tGet from shared queue %s priority %d", gpu_device->super.name, - parsec_gpu_describe_gpu_task(tmp, MAX_TASK_STRLEN, gpu_task), - gpu_task->ec->priority); - if( PARSEC_GPU_TASK_TYPE_D2D_COMPLETE == gpu_task->task_type ) { - goto get_data_out_of_device; + /* We are in the early stages, and if there no room on the GPU for a task we need to + * delay all retries for the same task for a little while. Meanwhile, put the task back + * trigger a device flush, and keep executing tasks that have their data on the device. + */ + if( NULL != error_task ) { + PARSEC_PUSH_TASK(gpu_device->exec_stream[0]->fifo_pending, (parsec_list_item_t*)error_task); + error_task = NULL; + } + + /* If we can extract data go for it, otherwise try to drain the pending tasks */ + parsec_gpu_task_t *dummy_task = parsec_gpu_create_w2r_task(gpu_device, es); + if( NULL != dummy_task ) { + get_data_out_of_device(es, gpu_device, dummy_task); + } } - } else { - pop_null++; - if( pop_null % 1024 == 1023 ) { - PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, "GPU[%s]:\tStill waiting for %d tasks to execute, but poped NULL the last %d times I tried to pop something...", - gpu_device->super.name, gpu_device->mutex, pop_null); + + /** Progress tasks in the execution queue */ + for( exec_stream = 2; exec_stream < gpu_device->num_exec_streams; exec_stream++) { + rc = progress_stream( gpu_device, gpu_device->exec_stream[exec_stream], + NULL, &error_task ); + + + if( rc < 0 ) { + if( PARSEC_HOOK_RETURN_DISABLE == rc ) + { + /* Something wrong happened. Push all the pending tasks back on the + * cores, and disable the gpu. + */ + parsec_warning("Critical issue related to the GPU discovered. Giving up\n"); + return PARSEC_HOOK_RETURN_DISABLE; + } + + if( PARSEC_HOOK_RETURN_ASYNC != rc ) { + /* Reschedule the task. As the chore_id has been modified, + another incarnation of the task will be executed. 
*/ + if( NULL != error_task ) { + parsec_cuda_kernel_cleanout(gpu_device, error_task); + __parsec_reschedule(es, error_task->ec); + + rc = remove_gpu_task(es, gpu_device, error_task); + free( error_task ); + if(0 == rc) { /** all tasks completed */ + return PARSEC_HOOK_RETURN_ASYNC; + } + } + } + + break; + } } - } - goto check_in_deps; - complete_task: - assert( NULL != gpu_task ); - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%s]:\tComplete %s", - gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, gpu_task->ec)); - /* Everything went fine so far, the result is correct and back in the main memory */ - PARSEC_LIST_ITEM_SINGLETON(gpu_task); - if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2HTRANSFER) { - parsec_gpu_complete_w2r_task(gpu_device, gpu_task, es); - gpu_task = progress_task; - goto fetch_task_from_shared_queue; - } - if (gpu_task->task_type == PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) { - free( gpu_task->ec ); - gpu_task->ec = NULL; - goto remove_gpu_task; - } - parsec_cuda_kernel_epilog( gpu_device, gpu_task ); - __parsec_complete_execution( es, gpu_task->ec ); - gpu_device->super.executed_tasks++; - remove_gpu_task: - parsec_atomic_fetch_add_int64(&gpu_device->super.device_load, -gpu_task->load); - PARSEC_DEBUG_VERBOSE(3, parsec_gpu_output_stream,"GPU[%s]: gpu_task %p freed at %s:%d", gpu_device->super.name, - gpu_task, __FILE__, __LINE__); - free( gpu_task ); - rc = parsec_atomic_fetch_dec_int32( &(gpu_device->mutex) ); - if( 1 == rc ) { /* I was the last one */ -#if defined(PARSEC_PROF_TRACE) - if( parsec_gpu_trackable_events & PARSEC_PROFILE_GPU_TRACK_OWN ) - PARSEC_PROFILING_TRACE( es->es_profile, parsec_gpu_own_GPU_key_end, - (unsigned long)es, PROFILE_OBJECT_ID_NULL, NULL ); -#endif /* defined(PARSEC_PROF_TRACE) */ - PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream,"GPU[%s]: Leaving GPU management at %s:%d", - gpu_device->super.name, __FILE__, __LINE__); + /** Progress tasks in the stage-out queue */ + rc = progress_stream( gpu_device, gpu_device->exec_stream[1], + parsec_cuda_kernel_pop, &error_task ); - return PARSEC_HOOK_RETURN_ASYNC; - } - gpu_task = progress_task; - goto fetch_task_from_shared_queue; + if( rc < 0 ) { + if( -1 == rc ) { + /* Something wrong happened. Push all the pending tasks back on the + * cores, and disable the gpu. + */ + parsec_warning("Critical issue related to the GPU discovered. Giving up\n"); + return PARSEC_HOOK_RETURN_DISABLE; + } + } - disable_gpu: - /* Something wrong happened. Push all the pending tasks back on the - * cores, and disable the gpu. - */ - parsec_warning("Critical issue related to the GPU discovered. 
Giving up\n"); - return PARSEC_HOOK_RETURN_DISABLE; + parsec_gpu_task_t *gpu_task_to_complete = NULL; + gpu_task_to_complete = (parsec_gpu_task_t*)parsec_fifo_try_pop( &(gpu_device->complete_queue)); + + if(NULL != gpu_task_to_complete) { + PARSEC_LIST_ITEM_SINGLETON(gpu_task_to_complete); + + if (gpu_task_to_complete->task_type == PARSEC_GPU_TASK_TYPE_D2HTRANSFER) { + parsec_gpu_complete_w2r_task(gpu_device, gpu_task_to_complete, es); + } + else { + + if (gpu_task_to_complete->task_type == PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) { + free( gpu_task_to_complete->ec ); + gpu_task_to_complete->ec = NULL; + } + else /** compute task*/ + { + parsec_cuda_kernel_epilog( gpu_device, gpu_task_to_complete ); + __parsec_complete_execution( es, gpu_task_to_complete->ec ); + gpu_device->super.executed_tasks++; + } + + rc = remove_gpu_task(es, gpu_device, gpu_task_to_complete); + free( gpu_task_to_complete ); + if(0 == rc) { /** all tasks completed */ + return PARSEC_HOOK_RETURN_ASYNC; + } + } + } + } while(1); } #endif /* PARSEC_HAVE_DEV_CUDA_SUPPORT */ diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index 9caa15db1..f280e32d9 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -76,6 +76,12 @@ typedef int (parsec_stage_out_function_t)(parsec_gpu_task_t *gtask, uint32_t flow_mask, parsec_gpu_exec_stream_t *gpu_stream); +typedef struct stream_cb_data_s { + parsec_device_gpu_module_t* gpu_device; + parsec_gpu_task_t* gpu_task; + parsec_gpu_exec_stream_t* current_stream; +} stream_cb_data_t; + struct parsec_gpu_task_s { parsec_list_item_t list_item; int task_type; @@ -84,6 +90,7 @@ struct parsec_gpu_task_s { parsec_complete_stage_function_t complete_stage; parsec_stage_in_function_t *stage_in; parsec_stage_out_function_t *stage_out; + stream_cb_data_t *stream_cb_data; #if defined(PARSEC_PROF_TRACE) int prof_key_end; uint64_t prof_event_id; @@ -128,17 +135,20 @@ struct parsec_device_gpu_module_s { parsec_list_t gpu_mem_lru; /* Read-only blocks, and fresh blocks */ parsec_list_t gpu_mem_owned_lru; /* Dirty blocks */ parsec_fifo_t pending; + parsec_fifo_t complete_queue; struct zone_malloc_s *memory; parsec_list_item_t *sort_starting_p; parsec_gpu_exec_stream_t **exec_stream; size_t mem_block_size; int64_t mem_nb_blocks; + int last_exec_stream_index; }; struct parsec_gpu_exec_stream_s { struct parsec_gpu_task_s **tasks; char *name; int32_t max_events; /* number of potential events, and tasks */ + int32_t active_event_count; /** events currently active*/ int32_t executed; /* number of executed tasks */ int32_t start; /* circular buffer management start and end positions */ int32_t end;
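
Illustration (not part of the patch): a minimal, self-contained sketch of the
cudaLaunchHostFunc() pattern the patch builds on. The names toy_cb_data_t,
toy_stream_cb, task_id and next_stream are invented stand-ins for the patch's
stream_cb_data_t, stream_cb_fn and PARSEC_PUSH_TASK calls; only
cudaLaunchHostFunc() and its contract come from the CUDA runtime: the registered
host function runs on a CUDA-managed thread after all work previously enqueued
on the stream has completed, and it must not make CUDA API calls itself.

    /* toy_host_callback.c - sketch only; build as CUDA host code linked against cudart */
    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        int task_id;      /* stand-in for the parsec_gpu_task_t being progressed */
        int next_stream;  /* index of the stream that should run the next stage  */
    } toy_cb_data_t;

    /* Host callback: runs once everything enqueued on the stream before it has
     * completed. It only touches host-side state; the printf stands in for
     * pushing the task onto the next stream's pending FIFO. */
    static void toy_stream_cb(void *arg)
    {
        toy_cb_data_t *cb = (toy_cb_data_t *)arg;
        printf("task %d finished its current stage, forward it to stream %d\n",
               cb->task_id, cb->next_stream);
        free(cb);
    }

    int main(void)
    {
        cudaStream_t stream;
        if( cudaSuccess != cudaStreamCreate(&stream) ) return 1;

        /* ... enqueue the asynchronous work of the current stage here ... */

        toy_cb_data_t *cb = (toy_cb_data_t *)malloc(sizeof(*cb));
        cb->task_id = 42;
        cb->next_stream = 2;
        if( cudaSuccess != cudaLaunchHostFunc(stream, toy_stream_cb, cb) ) {
            free(cb);
            return 1;
        }

        cudaStreamSynchronize(stream);  /* the callback has run by the time this returns */
        cudaStreamDestroy(stream);
        return 0;
    }

This is the property the scheduler rework relies on: the callback fires at the
point where the former cudaEventRecord()/cudaEventQuery() pair would have
reported completion, so the host side can push the task to the next stage's
pending FIFO immediately instead of polling for events.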