-
Notifications
You must be signed in to change notification settings - Fork 17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Allocate host copy on demand once data is moved to the host #711
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1177,6 +1177,7 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, | |
for(int i = 0; i < task->task_class->nb_flows; i++) { | ||
if( !(flow_mask & (1U << i)) ) continue; | ||
source = gtask->sources[i]; | ||
assert(source->device_private != NULL); | ||
dest = task->data[i].data_out; | ||
src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); | ||
dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); | ||
|
@@ -1237,6 +1238,12 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, | |
dir = parsec_device_gpu_transfer_direction_d2d; | ||
} else { | ||
dir = parsec_device_gpu_transfer_direction_d2h; | ||
if (dest->device_private == NULL && dest->alloc_cb != NULL) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So the copy on the main memory somehow exists but it points to non existing private data (aka. it has been never used before). Why does it exists then ? And why do you want to allocate it via a mechanism different than the traditional arena (which is more or less similar to this but instead of having a specialized alloc/free per copy we have it per arena) ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. We always have a host copy, which may not have device private memory allocated. That is the way we currently provide inputs for flows (instead of data). Unless we overhaul PaRSEC we need to pass a data copy. And I don't want to allocate device private for all of them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The only reason we currently pass the CPU copy between tasks, is because PaRSEC lacks a proper mechanism to acquire the accelerator data from the CPU (other than the pushout mechanism at the end of the execution of the GPU task). But, with the PR that allows data to be sent and received directly from/to GPU memory, there is no reason to ever need the data on the CPU, unless for final step of the computation before returning it to the user (in which case we could force a pushout). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So then an application creates GPU copies and allocates memory on devices on devices randomly selected for all tasks submitted? That defeats the automatic device selection and creates massive pressure on the zone allocators, for no good reason. The clean way is to pass There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we pass the |
||
dest->alloc_cb(dest, 0); // allocate on host | ||
} | ||
if (dest->device_private == NULL) { | ||
return PARSEC_HOOK_RETURN_ERROR; | ||
} | ||
} | ||
ret = src_dev->memcpy_async( src_dev, gpu_stream, | ||
dest->device_private, | ||
|
@@ -1660,6 +1667,48 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, | |
parsec_data_end_transfer_ownership_to_copy(task->data[i].data_out->original, | ||
gpu_device->super.device_index, | ||
flow->flow_flags); | ||
|
||
parsec_data_copy_t* source = gtask->sources[i]; | ||
parsec_device_gpu_module_t *src_device = | ||
(parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index ); | ||
if (task->data[i].data_in->flags & PARSEC_DATA_FLAG_EVICTED) { | ||
/** | ||
* The device copy had been evicted to the host and brought back in. | ||
* If this is the only device on which that data is used we can release | ||
* the host memory back to the application. If there are other devices | ||
* we cannot release the host memory because the data may actually be used | ||
* by a host task (e.g., after being sent there from a different device) | ||
* or be used as input to the other device. | ||
*/ | ||
parsec_data_copy_t *cpu_copy = task->data[i].data_out->original->device_copies[0]; | ||
parsec_data_copy_t *gpu_copy = task->data[i].data_out; | ||
parsec_data_t *original = task->data[i].data_out->original; | ||
/* release host memory if requested */ | ||
if (cpu_copy->device_private != NULL && | ||
bosilca marked this conversation as resolved.
Show resolved
Hide resolved
|
||
cpu_copy->release_cb != NULL) { | ||
bool may_release = true; | ||
/* check if there are any other device copies */ | ||
for (uint32_t i = 1; i < parsec_nb_devices; ++i) { | ||
parsec_data_copy_t *copy = original->device_copies[i]; | ||
if (NULL != copy && copy != gpu_copy) { | ||
may_release = false; | ||
break; | ||
} | ||
} | ||
if (may_release) { | ||
PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, | ||
"GPU[%d:%s]: CPU copy %p [ref_count %d] memory %p eager release", | ||
gpu_device->super.device_index, gpu_device->super.name, | ||
cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->device_private); | ||
cpu_copy->release_cb(cpu_copy, 0); | ||
cpu_copy->device_private = NULL; | ||
} | ||
} | ||
task->data[i].data_in->flags ^= PARSEC_DATA_FLAG_EVICTED; | ||
} | ||
|
||
parsec_atomic_unlock(&task->data[i].data_out->original->lock); | ||
|
||
#if defined(PARSEC_PROF_TRACE) | ||
if(gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_DATA_IN) { | ||
PARSEC_PROFILING_TRACE(gpu_stream->profiling, | ||
|
@@ -1669,10 +1718,6 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, | |
NULL); | ||
} | ||
#endif | ||
parsec_atomic_unlock(&task->data[i].data_out->original->lock); | ||
parsec_data_copy_t* source = gtask->sources[i]; | ||
parsec_device_gpu_module_t *src_device = | ||
(parsec_device_gpu_module_t*)parsec_mca_device_get( source->device_index ); | ||
if( PARSEC_DEV_IS_GPU(src_device->super.type) ) { | ||
int om; | ||
while(1) { | ||
|
@@ -1998,7 +2043,8 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, | |
if( NULL != this_task->data[i].data_out && | ||
(0 == (this_task->data[i].data_out->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) ) continue; | ||
|
||
assert( NULL != parsec_data_copy_get_ptr(this_task->data[i].data_in) ); | ||
assert( NULL != parsec_data_copy_get_ptr(this_task->data[i].data_in) | ||
|| NULL != this_task->data[i].data_in->alloc_cb ); | ||
|
||
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, | ||
"GPU[%d:%s]:\t\tIN Data of %s <%x> on GPU", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this needed ? A predecessor can provide a NULL data on a otherwise valid dependency. How is that case handled ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We cannot stage in data that does not exist. If we get here (because of RW or RO) the application requested data to be pushed onto the device. If the source has no memory then we cannot make up data. I consider this an error.