Skip to content

Commit

Permalink
Merge branch 'parsec-gpu-task-free' of github.com:devreal/ttg into pa…
Browse files Browse the repository at this point in the history
…rsec-gpu-task-free
  • Loading branch information
devreal committed Jan 9, 2025
2 parents 26c3358 + 17de4ce commit 4bf379e
Show file tree
Hide file tree
Showing 25 changed files with 413 additions and 102 deletions.
2 changes: 0 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,6 @@ endif(MPI_FOUND)
if (TTG_EXAMPLES)
# TiledArray brings in BTAS AND linear algebra (BLAS++/LAPACK++)
include(FindOrFetchTiledArray)
# OpenMP may also be used by some examples
find_package(OpenMP COMPONENTS CXX)
# std::execution may also be used by some examples
find_package(CXXStdExecution)
endif (TTG_EXAMPLES)
Expand Down
2 changes: 1 addition & 1 deletion cmake/modules/ExternalDependenciesVersions.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS)
# to be able to auto-update them

set(TTG_TRACKED_VG_CMAKE_KIT_TAG d1b34157c349cf0a7c2f149b7704a682d53f6486) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
set(TTG_TRACKED_VG_CMAKE_KIT_TAG 878654d0cb1904049fbd2c37b37d5385ae897658) # provides FindOrFetchLinalgPP and "real" FindOrFetchBoost
set(TTG_TRACKED_CATCH2_VERSION 3.5.0)
set(TTG_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058)
set(TTG_TRACKED_PARSEC_TAG c97e2fc54698d3d937d7847a12c7e9084b22a6c8)
Expand Down
4 changes: 2 additions & 2 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ add_ttg_executable(fw-apsp floyd-warshall/floyd_warshall.cc LINK_LIBRARIES MADwo
add_ttg_executable(helloworld helloworld/helloworld.cpp)
add_ttg_executable(simplegenerator simplegenerator/simplegenerator.cc RUNTIMES "mad")

if (OpenMP_CXX_FOUND AND TARGET std::execution)
add_ttg_executable(fw-apsp-df floyd-warshall/floyd_warshall_df.cc LINK_LIBRARIES OpenMP::OpenMP_CXX std::execution MADworld)
if (TARGET std::execution)
add_ttg_executable(fw-apsp-df floyd-warshall/floyd_warshall_df.cc LINK_LIBRARIES std::execution MADworld)
endif ()
add_ttg_executable(ge ge/ge.cc SINGLERANKONLY)
if (TARGET std::execution)
Expand Down
2 changes: 0 additions & 2 deletions examples/floyd-warshall/floyd_warshall_df.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ using namespace ttg;
#include "../blockmatrix.h"
#include "ttg/util/bug.h"

//#include <omp.h> //

#include "FW-APSP/FloydIterativeKernelDF.h" // contains the iterative kernel
#include "FW-APSP/FloydRecursiveSerialKernelDF.h" // contains the recursive but serial kernels
// #include "FloydRecursiveParallelKernel.h" // contains the recursive and parallel kernels
Expand Down
15 changes: 14 additions & 1 deletion tests/unit/device_coro.cc
Original file line number Diff line number Diff line change
Expand Up @@ -426,13 +426,26 @@ TEST_CASE("Device", "coro") {
#endif // TTG_HAVE_CUDA
}
};

auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(edge),
"device_task", {"edge_in"}, {"edge_out"});
ttg::make_graph_executable(tt);
if (ttg::default_execution_context().rank() == 0) tt->invoke(0, value_t{});
ttg::ttg_fence(ttg::default_execution_context());
}

/* Checks that a device task can co_await a select() that names no buffers or
 * scratch objects and then simply finish. */
SECTION("empty-select") {
ttg::Edge<void, void> edge;
/* task body without key or value arguments */
auto fn = []() -> ttg::device::Task {
co_await ttg::device::select();
/* nothing else to do */
};
/* NOTE(review): no output edges are passed (ttg::edges()) but one output name
 * ("edge_out") is given -- confirm that make_tt accepts this mismatch. */
auto tt = ttg::make_tt<ttg::ExecutionSpace::CUDA>(fn, ttg::edges(edge), ttg::edges(),
"device_task", {"edge_in"}, {"edge_out"});
ttg::make_graph_executable(tt);
/* NOTE(review): invoke() is called without a rank check here -- confirm this
 * is intended when the test runs on more than one rank. */
tt->invoke();
ttg::ttg_fence(ttg::default_execution_context());
};

}

#endif // TTG_IMPL_DEVICE_SUPPORT
5 changes: 4 additions & 1 deletion ttg/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ if (TARGET MADworld)
set(ttg-mad-headers
${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/buffer.h
${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/device.h
${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/devicefunc.h
${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/fwd.h
${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/import.h
${CMAKE_CURRENT_SOURCE_DIR}/ttg/madness/ttg.h
Expand Down Expand Up @@ -247,8 +248,10 @@ if (TARGET PaRSEC::parsec)
# parsec depends on TTG's serialization layer since it does not provide its own
if (TTG_PARSEC_USE_BOOST_SERIALIZATION AND TARGET ttg-serialization-boost)
list(APPEND ttg-parsec-deps ttg-serialization-boost)
elseif(TARGET ttg-serialization-madness)
list(APPEND ttg-parsec-deps ttg-serialization-madness)
else()
list(APPEND ttg-parsec-deps ttg-serialization)
message(WARNING "missing full-featured serialization support for ttg-parsec: either TTG_PARSEC_USE_BOOST_SERIALIZATION=OFF or Boost not found, and MADNESS not found")
endif()
add_ttg_library(ttg-parsec "${ttg-parsec-headers}"
PUBLIC_HEADER "${ttg-parsec-headers}"
Expand Down
12 changes: 6 additions & 6 deletions ttg/ttg.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@
#include "ttg/config.h"
#include "ttg/fwd.h"

#if defined(TTG_USE_PARSEC)
#include "ttg/parsec/ttg.h"
#elif defined(TTG_USE_MADNESS)
#include "ttg/madness/ttg.h"
#endif // TTG_USE_{PARSEC|MADNESS}

#include "ttg/runtimes.h"
#include "ttg/util/demangle.h"
#include "ttg/util/hash.h"
Expand Down Expand Up @@ -37,12 +43,6 @@
#include "ttg/device/device.h"
#include "ttg/device/task.h"

#if defined(TTG_USE_PARSEC)
#include "ttg/parsec/ttg.h"
#elif defined(TTG_USE_MADNESS)
#include "ttg/madness/ttg.h"
#endif // TTG_USE_{PARSEC|MADNESS}

// these headers use the default backend
#include "ttg/run.h"

Expand Down
1 change: 1 addition & 0 deletions ttg/ttg/base/tt.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include "ttg/base/terminal.h"
#include "ttg/util/demangle.h"
#include "ttg/util/trace.h"

namespace ttg {

Expand Down
20 changes: 20 additions & 0 deletions ttg/ttg/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,32 @@
#include <memory>

#include "ttg/fwd.h"
#include "ttg/util/meta.h"

namespace ttg {

template<typename T, typename Allocator = std::allocator<std::decay_t<T>>>
using Buffer = TTG_IMPL_NS::Buffer<T, Allocator>;

namespace meta {

/* Specialize some traits */

/* a ttg::Buffer is a buffer ... */
template <typename ValueT, typename AllocatorT>
struct is_buffer<ttg::Buffer<ValueT, AllocatorT>> : std::true_type {};

/* ... and so is a const-qualified ttg::Buffer */
template <typename ValueT, typename AllocatorT>
struct is_buffer<const ttg::Buffer<ValueT, AllocatorT>> : std::true_type {};

/* the constness of a ttg::Buffer follows the constness of its value type */
template <typename ValueT, typename AllocatorT>
struct is_const<ttg::Buffer<ValueT, AllocatorT>> : std::is_const<ValueT> {};

} // namespace meta

} // namespace ttg

#endif // TTG_buffer_H
4 changes: 4 additions & 0 deletions ttg/ttg/device/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ namespace ttg::device {
bool is_invalid() const {
return (m_space == ttg::ExecutionSpace::Invalid);
}

/* Returns a default-constructed Device.
 * NOTE(review): presumably the default constructor denotes the host -- confirm. */
static Device host() {
  return Device{};
}
};
} // namespace ttg::device

Expand Down
85 changes: 78 additions & 7 deletions ttg/ttg/device/task.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,80 @@
#include <type_traits>
#include <span>


#include "ttg/fwd.h"
#include "ttg/impl_selector.h"
#include "ttg/ptr.h"
#include "ttg/devicescope.h"

#ifdef TTG_HAVE_COROUTINE

namespace ttg::device {

namespace detail {

/* Describes one input of a device task in a form that does not depend on the
 * static type of the buffer or scratch object it was created from. */
struct device_input_data_t {
  /* whatever the backend's buffer_data() returns for a buffer */
  using impl_data_t = decltype(TTG_IMPL_NS::buffer_data(std::declval<ttg::Buffer<int>>()));

  impl_data_t impl_data;  // backend handle as returned by buffer_data()
  ttg::scope scope;       // scope reported by the object's scope()
  bool is_const;          // whether the object was classified as const
  bool is_scratch;        // whether the object is a devicescratch

  device_input_data_t(impl_data_t data, ttg::scope scope, bool isconst, bool isscratch)
  : impl_data(data)
  , scope(scope)
  , is_const(isconst)
  , is_scratch(isscratch)
  { }
};

/* Object handed to co_await by select(): holds one lvalue reference
 * per type in the parameter pack. */
template <typename... ArgTs>
struct to_device_t {
  std::tuple<std::add_lvalue_reference_t<ArgTs>...> ties;
};

/* Turns the references held by a to_device_t into an array with one
 * device_input_data_t per referenced object (backend data handle, scope,
 * constness and whether the object is a devicescratch). */
template <typename... Ts, std::size_t... Is>
auto extract_buffer_data(detail::to_device_t<Ts...>& a, std::index_sequence<Is...>) {
  using element_types = std::tuple<Ts...>;
  using result_type = std::array<device_input_data_t, sizeof...(Ts)>;
  return result_type{
      device_input_data_t{TTG_IMPL_NS::buffer_data(std::get<Is>(a.ties)),
                          std::get<Is>(a.ties).scope(),
                          ttg::meta::is_const_v<std::tuple_element_t<Is, element_types>>,
                          ttg::meta::is_devicescratch_v<std::tuple_element_t<Is, element_types>>}...};
}
} // namespace detail

/**
 * Collects the inputs of a device task (objects accepted by buffer_data()
 * that also provide scope()) so that they can be handed to select() as a
 * single object. Inputs can be given at construction or appended with add().
 */
struct Input {
private:
  std::vector<detail::device_input_data_t> m_data;

public:
  /** Creates an empty set of inputs. */
  Input() { }

  /**
   * Creates a set of inputs from \c args, recording for each argument its
   * backend data handle, its scope, whether the argument itself is
   * const-qualified, and whether it is a devicescratch.
   *
   * The template is disabled for a single argument of type Input: otherwise
   * it would be a better match than the implicit copy constructor for a
   * non-const lvalue (e.g., <tt>Input b(a);</tt>) and fail to compile.
   *
   * NOTE(review): constness is determined with std::is_const_v on the
   * argument type, whereas extract_buffer_data() uses ttg::meta::is_const_v
   * -- confirm that the difference is intended.
   */
  template<typename... Args,
           typename = std::enable_if_t<!(sizeof...(Args) == 1 &&
                                         (std::is_same_v<std::decay_t<Args>, Input> && ...))>>
  Input(Args&&... args)
  : m_data{{TTG_IMPL_NS::buffer_data(args), args.scope(),
            std::is_const_v<std::remove_reference_t<Args>>,
            ttg::meta::is_devicescratch_v<std::decay_t<Args>>}...}
  { }

  /** Appends \c v to the set of inputs (same information as in the constructor). */
  template<typename T>
  void add(T&& v) {
    using type = std::remove_reference_t<T>;
    m_data.emplace_back(TTG_IMPL_NS::buffer_data(v), v.scope(), std::is_const_v<type>,
                        ttg::meta::is_devicescratch_v<type>);
  }

  /** Returns a view of the collected inputs; the view refers to storage owned by this object. */
  ttg::span<detail::device_input_data_t> span() {
    return ttg::span(m_data);
  }
};

namespace detail {
// overload for Input
// Returned by select(Input&). Instead of a tuple of references it holds a
// reference to the caller's Input, so that Input must outlive this object.
template <>
struct to_device_t<Input> {
Input& input;
};
} // namespace detail

/**
* Select a device to execute on based on the provided buffer and scratchspace objects.
* Returns an object that should be awaited on using \c co_await.
Expand All @@ -33,6 +92,11 @@ namespace ttg::device {
return detail::to_device_t<std::remove_reference_t<Args>...>{std::tie(std::forward<Args>(args)...)};
}

[[nodiscard]]
inline auto select(Input& input) {
return detail::to_device_t<Input>{input};
}

namespace detail {

enum ttg_device_coro_state {
Expand Down Expand Up @@ -244,8 +308,7 @@ namespace ttg::device {
}


template <std::size_t i, typename valueT, typename... out_keysT, typename... out_valuesT,
ttg::Runtime Runtime = ttg::ttg_runtime>
template <std::size_t i, typename valueT, ttg::Runtime Runtime = ttg::ttg_runtime>
inline detail::send_t sendv(valueT &&value) {
return sendv(i, std::forward<valueT>(value));
}
Expand Down Expand Up @@ -283,8 +346,7 @@ namespace ttg::device {
}
}

template <size_t KeyId, size_t I, size_t... Is, typename... RangesT, typename valueT,
typename... out_keysT, typename... out_valuesT>
template <size_t KeyId, size_t I, size_t... Is, typename... RangesT, typename valueT>
inline void prepare_broadcast(const std::tuple<RangesT...> &keylists, valueT &&value) {
using key_t = typename broadcast_keylist_trait<
std::tuple_element_t<KeyId, std::tuple<std::remove_reference_t<RangesT>...>>
Expand Down Expand Up @@ -448,8 +510,9 @@ namespace ttg::device {
ttg::Runtime Runtime = ttg::ttg_runtime>
inline detail::send_t broadcast(rangeT &&keylist, valueT &&value) {
ttg::detail::value_copy_handler<Runtime> copy_handler;
return detail::send_t{broadcast_coro<i>(std::tie(keylist), copy_handler(std::forward<valueT>(value)),
std::move(copy_handler))};
return detail::send_t{detail::broadcast_coro<i>(std::tie(keylist),
copy_handler(std::forward<valueT>(value)),
std::move(copy_handler))};
}

/* overload with explicit terminals and keylist passed by const reference */
Expand Down Expand Up @@ -556,7 +619,15 @@ namespace ttg::device {

template<typename... Ts>
ttg::suspend_always await_transform(detail::to_device_t<Ts...>&& a) {
bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.ties));
auto arr = detail::extract_buffer_data(a, std::make_index_sequence<sizeof...(Ts)>{});
bool need_transfer = !(TTG_IMPL_NS::register_device_memory(ttg::span(arr)));
/* TODO: are we allowed to not suspend here and launch the kernel directly? */
m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER;
return {};
}

ttg::suspend_always await_transform(detail::to_device_t<Input>&& a) {
bool need_transfer = !(TTG_IMPL_NS::register_device_memory(a.input.span()));
/* TODO: are we allowed to not suspend here and launch the kernel directly? */
m_state = ttg::device::detail::TTG_DEVICE_CORO_WAIT_TRANSFER;
return {};
Expand Down
19 changes: 19 additions & 0 deletions ttg/ttg/devicescratch.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "ttg/devicescope.h"
#include "ttg/fwd.h"
#include "ttg/util/meta.h"

namespace ttg {

Expand All @@ -14,6 +15,24 @@ auto make_scratch(T* val, ttg::scope scope, std::size_t count = 1) {
return devicescratch<T>(val, scope, count);
}

namespace meta {

/* Specialize some traits */

/* a ttg::devicescratch is a device scratch ... */
template <typename ValueT>
struct is_devicescratch<ttg::devicescratch<ValueT>> : std::true_type {};

/* ... and so is a const-qualified ttg::devicescratch */
template <typename ValueT>
struct is_devicescratch<const ttg::devicescratch<ValueT>> : std::true_type {};

/* the constness of a ttg::devicescratch follows the constness of its value type */
template <typename ValueT>
struct is_const<ttg::devicescratch<ValueT>> : std::is_const<ValueT> {};

} // namespace meta

} // namespace ttg

#endif // TTG_DEVICESCRATCH_H
12 changes: 12 additions & 0 deletions ttg/ttg/madness/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

#include "ttg/serialization/traits.h"

#include "ttg/device/device.h"

namespace ttg_madness {

/// A runtime-managed buffer mirrored between host and device memory
Expand Down Expand Up @@ -110,6 +112,12 @@ struct Buffer : private Allocator {
/* no-op */
}


/* Reports whether the buffer's data is current on \c dev.
 * This implementation ignores \c dev and always returns true; it only
 * asserts that the buffer is valid. */
bool is_current_on(ttg::device::Device dev) const {
assert(is_valid());
return true;
}

/* Get the owner device ID, i.e., the last updated
* device buffer. */
ttg::device::Device get_owner_device() const {
Expand Down Expand Up @@ -178,6 +186,10 @@ struct Buffer : private Allocator {
throw std::runtime_error("not implemented yet");
}

bool empty() const {
return (m_host_data == nullptr);
}

/* TODO: can we do this automatically?
* Pin the memory on all devices we currently track.
* Pinned memory won't be released by PaRSEC and can be used
Expand Down
Loading

0 comments on commit 4bf379e

Please sign in to comment.