diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index a44572018..f1ef7c1e6 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -114,8 +114,8 @@ namespace ttg_parsec { uint8_t i; // only limited number of flows detail::parsec_ttg_task_base_t *caller = detail::parsec_ttg_caller; assert(nullptr != caller->dev_ptr); + caller->dev_ptr->gpu_task->allocate_flows(span.size()); parsec_gpu_task_t *gpu_task = caller->dev_ptr->gpu_task; - parsec_flow_t *flows = caller->dev_ptr->flows; bool is_current = false; for (i = 0; i < span.size(); ++i) { @@ -140,14 +140,15 @@ namespace ttg_parsec { /* build the flow */ /* TODO: reuse the flows of the task class? How can we control the sync direction then? */ - flows[i] = parsec_flow_t{.name = nullptr, + *((parsec_flow_t*)gpu_task->flow_info[i].flow) = + parsec_flow_t{.name = nullptr, .sym_type = PARSEC_SYM_INOUT, .flow_flags = static_cast(access), .flow_index = i, .flow_datatype_mask = ~0 }; - gpu_task->flow_nb_elts[i] = data->nb_elts; // size in bytes - gpu_task->flow[i] = &flows[i]; + gpu_task->flow_info[i].flow_span = data->span; // size in bytes + gpu_task->flow_info[i].flow_dc = nullptr; /* set the input data copy, parsec will take care of the transfer * and the buffer will look at the parsec_data_t for the current pointer */ @@ -158,25 +159,17 @@ namespace ttg_parsec { } else { /* ignore the flow */ - flows[i] = parsec_flow_t{.name = nullptr, + *((parsec_flow_t*)gpu_task->flow_info[i].flow) = + parsec_flow_t{.name = nullptr, .sym_type = PARSEC_FLOW_ACCESS_NONE, .flow_flags = 0, .flow_index = i, .flow_datatype_mask = ~0 }; - gpu_task->flow[i] = &flows[i]; - gpu_task->flow_nb_elts[i] = 0; // size in bytes + gpu_task->flow_info[i].flow_span = 0; // size in bytes caller->parsec_task.data[i].data_in = nullptr; } } - /* reset all remaining entries in the current task */ - for (; i < MAX_PARAM_COUNT; ++i) { - detail::parsec_ttg_caller->parsec_task.data[i].data_in = nullptr; - detail::parsec_ttg_caller->dev_ptr->flows[i].flow_flags = PARSEC_FLOW_ACCESS_NONE; - detail::parsec_ttg_caller->dev_ptr->flows[i].flow_index = i; - detail::parsec_ttg_caller->dev_ptr->gpu_task->flow[i] = &detail::parsec_ttg_caller->dev_ptr->flows[i]; - detail::parsec_ttg_caller->dev_ptr->gpu_task->flow_nb_elts[i] = 0; - } // we cannot allow the calling thread to submit kernels so say we're not ready return is_current; } @@ -198,7 +191,7 @@ namespace ttg_parsec { int ret = device_module->memcpy_async(device_module, stream, data->device_copies[0]->device_private, data->device_copies[data->owner_device]->device_private, - data->nb_elts, parsec_device_gpu_transfer_direction_d2h); + data->span, parsec_device_gpu_transfer_direction_d2h); assert(ret == PARSEC_SUCCESS); } if constexpr (sizeof...(Is) > 0) { diff --git a/ttg/ttg/parsec/task.h b/ttg/ttg/parsec/task.h index 5df3aca0f..2080aaf29 100644 --- a/ttg/ttg/parsec/task.h +++ b/ttg/ttg/parsec/task.h @@ -16,17 +16,15 @@ namespace ttg_parsec { if (this->memory != nullptr) free_flows(); constexpr const auto align = std::align_val_t(std::max(alignof(parsec_flow_t), alignof(parsec_gpu_flow_info_t))); this->memory = new(align) std::byte[size * (sizeof(parsec_flow_t) + sizeof(parsec_gpu_flow_info_s))]; - if (this->flow_info != nullptr) { - parsec_flow_t *flows = (parsec_flow_t*)this->memory; - this->flow_info = (parsec_gpu_flow_info_t*)(this->memory + size * sizeof(parsec_flow_t)); - for (std::size_t i = 0; i < size; ++i) { - this->flow_info[i].flow = &flows[i]; - flows[i].flow_index = i; - flows[i].flow_flags = 0; - flows[i].flow_datatype_mask = ~0; - } - this->nb_flows = size; + parsec_flow_t *flows = (parsec_flow_t*)this->memory; + this->flow_info = (parsec_gpu_flow_info_t*)(this->memory + size * sizeof(parsec_flow_t)); + for (std::size_t i = 0; i < size; ++i) { + this->flow_info[i].flow = &flows[i]; + flows[i].flow_index = i; + flows[i].flow_flags = 0; + flows[i].flow_datatype_mask = ~0; } + this->nb_flows = size; } void free_flows() { diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 1c439ce4c..597d29800 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1466,10 +1466,10 @@ namespace ttg_parsec { ttg::device::detail::reset_current(); auto discard_tmp_flows = [&](){ - for (int i = 0; i < MAX_PARAM_COUNT; ++i) { - if (gpu_task->flow[i]->flow_flags & TTG_PARSEC_FLOW_ACCESS_TMP) { + for (int i = 0; i < gpu_task->nb_flows; ++i) { + if (gpu_task->flow_info[i].flow->flow_flags & TTG_PARSEC_FLOW_ACCESS_TMP) { /* temporary flow, discard by setting it to read-only to avoid evictions */ - const_cast(gpu_task->flow[i])->flow_flags = PARSEC_FLOW_ACCESS_READ; + const_cast(gpu_task->flow_info[i].flow)->flow_flags = PARSEC_FLOW_ACCESS_READ; task->parsec_task.data[i].data_out->readers = 1; } }