From ad7469a8cdc1e7999ac559ceffdb30804056a5d7 Mon Sep 17 00:00:00 2001 From: Ben Prather Date: Wed, 31 Jul 2024 16:43:06 -0600 Subject: [PATCH 1/3] Add par_reduce_inner functions --- src/kokkos_abstraction.hpp | 60 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index ca8c59ffe12e..91da43dfd65d 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -976,6 +976,66 @@ KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(team_mbr_t team_member, Args &&.. par_for_inner(DEFAULT_INNER_LOOP_PATTERN, team_member, std::forward(args)...); } +// Inner reduction loops +template +KOKKOS_FORCEINLINE_FUNCTION void +par_reduce_inner(team_mbr_t team_member, const int kl, const int ku, const int jl, + const int ju, const int il, const int iu, const Function &function, + T reduction) { + const int Nk = ku - kl + 1; + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NkNjNi = Nk * Nj * Ni; + const int NjNi = Nj * Ni; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team_member, NkNjNi), + [&](const int &idx, typename T::value_type &lreduce) { + int k = idx / NjNi; + int j = (idx - k * NjNi) / Ni; + int i = idx - k * NjNi - j * Ni; + k += kl; + j += jl; + i += il; + function(k, j, i, lreduce); + }, + reduction); +} + +template +KOKKOS_FORCEINLINE_FUNCTION void +par_reduce_inner(team_mbr_t team_member, const int jl, + const int ju, const int il, const int iu, const Function &function, + T reduction) { + const int Nj = ju - jl + 1; + const int Ni = iu - il + 1; + const int NjNi = Nj * Ni; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team_member, NjNi), + [&](const int &idx, typename T::value_type &lreduce) { + int j = idx / Ni; + int i = idx - j * Ni; + j += jl; + i += il; + function(j, i, lreduce); + }, + reduction); +} + +template +KOKKOS_FORCEINLINE_FUNCTION void +par_reduce_inner(team_mbr_t team_member, const int il, const int iu, const Function &function, + T reduction) { + const int Ni = iu - il + 1; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team_member, Ni), + [&](const int &idx, typename T::value_type &lreduce) { + int i = idx; + i += il; + function(i, lreduce); + }, + reduction); +} + // reused from kokoks/core/perf_test/PerfTest_ExecSpacePartitioning.cpp // commit a0d011fb30022362c61b3bb000ae3de6906cb6a7 template From 0beb9472fc751adff02f6595da10195a37171a1a Mon Sep 17 00:00:00 2001 From: Ben Prather Date: Fri, 2 Aug 2024 09:44:03 -0600 Subject: [PATCH 2/3] Formatting, changelog --- CHANGELOG.md | 1 + src/kokkos_abstraction.hpp | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9cb66c3d573f..bd898db7795d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Current develop ### Added (new features/APIs/variables/...) +- [[PR 1147]](https://github.com/parthenon-hpc-lab/parthenon/pull/1147) Add `par_reduce_inner` functions - [[PR 1140]](https://github.com/parthenon-hpc-lab/parthenon/pull/1140) Allow for relative convergence tolerance in BiCGSTAB solver. - [[PR 1047]](https://github.com/parthenon-hpc-lab/parthenon/pull/1047) General three- and four-valent 2D forests w/ arbitrary orientations. - [[PR 1130]](https://github.com/parthenon-hpc-lab/parthenon/pull/1130) Enable `parthenon::par_reduce` for MD loops with Kokkos 1D Range diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 91da43dfd65d..77a0a4af5e55 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -1003,9 +1003,8 @@ par_reduce_inner(team_mbr_t team_member, const int kl, const int ku, const int j template KOKKOS_FORCEINLINE_FUNCTION void -par_reduce_inner(team_mbr_t team_member, const int jl, - const int ju, const int il, const int iu, const Function &function, - T reduction) { +par_reduce_inner(team_mbr_t team_member, const int jl, const int ju, const int il, + const int iu, const Function &function, T reduction) { const int Nj = ju - jl + 1; const int Ni = iu - il + 1; const int NjNi = Nj * Ni; @@ -1022,9 +1021,9 @@ par_reduce_inner(team_mbr_t team_member, const int jl, } template -KOKKOS_FORCEINLINE_FUNCTION void -par_reduce_inner(team_mbr_t team_member, const int il, const int iu, const Function &function, - T reduction) { +KOKKOS_FORCEINLINE_FUNCTION void par_reduce_inner(team_mbr_t team_member, const int il, + const int iu, const Function &function, + T reduction) { const int Ni = iu - il + 1; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team_member, Ni), From b6d39c53e08fe3658fd73e14c7a4c5506dca7c89 Mon Sep 17 00:00:00 2001 From: Ben Prather Date: Thu, 22 Aug 2024 15:02:26 -0600 Subject: [PATCH 3/3] Be explicit that existing inner reductions are TeamThreadRange --- src/kokkos_abstraction.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index 77a0a4af5e55..8fa89f82e95e 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -979,9 +979,9 @@ KOKKOS_FORCEINLINE_FUNCTION void par_for_inner(team_mbr_t team_member, Args &&.. // Inner reduction loops template KOKKOS_FORCEINLINE_FUNCTION void -par_reduce_inner(team_mbr_t team_member, const int kl, const int ku, const int jl, - const int ju, const int il, const int iu, const Function &function, - T reduction) { +par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int kl, const int ku, + const int jl, const int ju, const int il, const int iu, + const Function &function, T reduction) { const int Nk = ku - kl + 1; const int Nj = ju - jl + 1; const int Ni = iu - il + 1; @@ -1003,8 +1003,8 @@ par_reduce_inner(team_mbr_t team_member, const int kl, const int ku, const int j template KOKKOS_FORCEINLINE_FUNCTION void -par_reduce_inner(team_mbr_t team_member, const int jl, const int ju, const int il, - const int iu, const Function &function, T reduction) { +par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int jl, const int ju, + const int il, const int iu, const Function &function, T reduction) { const int Nj = ju - jl + 1; const int Ni = iu - il + 1; const int NjNi = Nj * Ni; @@ -1021,9 +1021,9 @@ par_reduce_inner(team_mbr_t team_member, const int jl, const int ju, const int i } template -KOKKOS_FORCEINLINE_FUNCTION void par_reduce_inner(team_mbr_t team_member, const int il, - const int iu, const Function &function, - T reduction) { +KOKKOS_FORCEINLINE_FUNCTION void +par_reduce_inner(InnerLoopPatternTTR, team_mbr_t team_member, const int il, const int iu, + const Function &function, T reduction) { const int Ni = iu - il + 1; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team_member, Ni),