diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 1f238b1fdbf7..d704bc7f9f71 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -94,7 +94,8 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { Kokkos::parallel_for( PARTHENON_AUTO_LABEL, - Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), + Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO, + pmesh->GetCommVectorLength()), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -110,28 +111,42 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { auto &idxer = bnd_info(b).idxer[it]; const int iel = static_cast(bnd_info(b).topo_idx[it]) % 3; const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx, bool &lnon_zero) { - const auto [t, u, v, k, j, i] = idxer(idx * Ni); - Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); - Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); - - Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m) { buf[m] = var[m]; }); - - bool mnon_zero = false; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange<>(team_member, Ni), - [&](int m, bool &llnon_zero) { - llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); - }, - Kokkos::LOr(mnon_zero)); - - lnon_zero = lnon_zero || mnon_zero; - if (bound_type == BoundaryType::flxcor_send) lnon_zero = true; - }, - Kokkos::LOr(non_zero[iel])); + if (threshold > 0.0) { + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx, bool &lnon_zero) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { buf[m] = var[m]; }); + + bool mnon_zero = false; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m, bool &llnon_zero) { + llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold); + }, + Kokkos::LOr(mnon_zero)); + + lnon_zero = lnon_zero || mnon_zero; + if (bound_type == BoundaryType::flxcor_send) lnon_zero = true; + }, + Kokkos::LOr(non_zero[iel])); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i); + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { buf[m] = var[m]; }); + }); + non_zero[iel] = true; + } idx_offset += idxer.size(); } Kokkos::single(Kokkos::PerTeam(team_member), [&]() { @@ -272,7 +287,8 @@ TaskStatus SetBounds(std::shared_ptr> &md) { auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( PARTHENON_AUTO_LABEL, - Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), + Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO, + pmesh->GetCommVectorLength()), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); if (bnd_info(b).same_to_same) return; @@ -280,54 +296,112 @@ TaskStatus SetBounds(std::shared_ptr> &md) { for (int it = 0; it < bnd_info(b).ntopological_elements; ++it) { auto &idxer = bnd_info(b).idxer[it]; auto &lcoord_trans = bnd_info(b).lcoord_trans; + const bool isTrivial = lcoord_trans.IsTrivial(); auto &var = bnd_info(b).var; const auto [tel, ftemp] = lcoord_trans.InverseTransform(bnd_info(b).topo_idx[it]); + const bool isCell = (tel == TopologicalElement::CC); Real fac = ftemp; // Can't capture structured bindings const int iel = static_cast(tel) % 3; const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1; if (bnd_info(b).buf_allocated && bnd_info(b).allocated) { - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx) { - Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); - const auto [t, u, v, k, j, i] = idxer(idx * Ni); - // Have to do this because of some weird issue about structure bindings - // being captured - const int tt = t; - const int uu = u; - const int vv = v; - const int kk = k; - const int jj = j; - const int ii = i; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { - const auto [il, jl, kl] = - lcoord_trans.InverseTransform({ii + m, jj, kk}); - if (idxer.IsActive(kl, jl, il)) - var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m]; - }); - }); + if (isTrivial && isCell) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var_ptr = &var(iel, t, u, v, k, j, i); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { var_ptr[m] = buf[m]; }); + }); + } else if (isTrivial) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + // Have to do this because of some weird issue about structure + // bindings being captured + const int kk = k; + const int jj = j; + const int ii = i; + Real *var_ptr = &var(iel, t, u, v, kk, jj, ii); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var_ptr[m] = buf[m]; + }); + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset); + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + // Have to do this because of some weird issue about structure + // bindings being captured + const int tt = t; + const int uu = u; + const int vv = v; + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { + const auto [il, jl, kl] = + lcoord_trans.InverseTransform({ii + m, jj, kk}); + if (idxer.IsActive(kl, jl, il)) + var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m]; + }); + }); + } } else if (bnd_info(b).allocated && bound_type != BoundaryType::flxcor_recv) { const Real default_val = bnd_info(b).var.sparse_default_val; - Kokkos::parallel_for( - Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), - [&](const int idx) { - const auto [t, u, v, k, j, i] = idxer(idx * Ni); - const int tt = t; - const int uu = u; - const int vv = v; - const int kk = k; - const int jj = j; - const int ii = i; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { - const auto [il, jl, kl] = - lcoord_trans.InverseTransform({ii + m, jj, kk}); - if (idxer.IsActive(kl, jl, il)) - var(iel, tt, uu, vv, kl, jl, il) = default_val; - }); - }); + if (isTrivial && isCell) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + Real *var_ptr = &var(iel, t, u, v, k, j, i); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { var_ptr[m] = default_val; }); + }); + } else if (isTrivial) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + const int kk = k; + const int jj = j; + const int ii = i; + Real *var_ptr = &var(iel, t, u, v, k, j, i); + Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni), + [&](int m) { + if (idxer.IsActive(kk, jj, ii + m)) + var_ptr[m] = default_val; + }); + }); + } else { + Kokkos::parallel_for( + Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni), + [&](const int idx) { + const auto [t, u, v, k, j, i] = idxer(idx * Ni); + const int tt = t; + const int uu = u; + const int vv = v; + const int kk = k; + const int jj = j; + const int ii = i; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) { + const auto [il, jl, kl] = + lcoord_trans.InverseTransform({ii + m, jj, kk}); + if (idxer.IsActive(kl, jl, il)) + var(iel, tt, uu, vv, kl, jl, il) = default_val; + }); + }); + } } idx_offset += idxer.size(); } diff --git a/src/mesh/forest/logical_coordinate_transformation.hpp b/src/mesh/forest/logical_coordinate_transformation.hpp index 0e0d0bc293cf..055c3df5a39a 100644 --- a/src/mesh/forest/logical_coordinate_transformation.hpp +++ b/src/mesh/forest/logical_coordinate_transformation.hpp @@ -52,6 +52,13 @@ struct LogicalCoordinateTransformation { std::int64_t origin) const; CellCentOffsets Transform(CellCentOffsets in) const; + // Check if this transformation includes at most a translation + KOKKOS_INLINE_FUNCTION + bool IsTrivial() const { + return (dir_connection[0] == 0) && (dir_connection[1] == 1) && + (dir_connection[2] == 2) && !dir_flip[0] && !dir_flip[1] && !dir_flip[2]; + } + KOKKOS_INLINE_FUNCTION std::tuple Transform(TopologicalElement el) const { int iel = static_cast(el); diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp index 2672155e83dc..b0e27b109e72 100644 --- a/src/mesh/mesh.cpp +++ b/src/mesh/mesh.cpp @@ -77,6 +77,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages, default_pack_size_(pin->GetOrAddInteger("parthenon/mesh", "pack_size", -1)), // private members: num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)), + comm_vector_length_( + pin->GetOrAddInteger("parthenon/mesh", "comm_vector_length", 1)), use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(), lb_manual_(), nslist(Globals::nranks), nblist(Globals::nranks), nref(Globals::nranks), nderef(Globals::nranks), rdisp(Globals::nranks), diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp index ca221d3f6e56..eb41ebe3faca 100644 --- a/src/mesh/mesh.hpp +++ b/src/mesh/mesh.hpp @@ -99,6 +99,8 @@ class Mesh { return nblist[my_rank]; } int GetNumMeshThreads() const { return num_mesh_threads_; } + int GetCommVectorLength() const { return comm_vector_length_; } + std::int64_t GetTotalCells(); // TODO(JMM): Move block_size into mesh. int GetNumberOfMeshBlockCells() const; @@ -258,6 +260,7 @@ class Mesh { // data int root_level, max_level, current_level; int num_mesh_threads_; + int comm_vector_length_; /// Maps Global Block IDs to which rank the block is mapped to. std::vector ranklist; /// Maps rank to start of local block IDs.