Skip to content

Commit

Permalink
minimize junk in inner comms loops to the extent possible
Browse files Browse the repository at this point in the history
  • Loading branch information
lroberts36 committed Sep 12, 2024
1 parent e8ed786 commit acafaaf
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 63 deletions.
200 changes: 137 additions & 63 deletions src/bvals/comms/boundary_communication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {

Kokkos::parallel_for(
PARTHENON_AUTO_LABEL,
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO,
pmesh->GetCommVectorLength()),
KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
const int b = team_member.league_rank();

Expand All @@ -110,28 +111,42 @@ TaskStatus SendBoundBufs(std::shared_ptr<MeshData<Real>> &md) {
auto &idxer = bnd_info(b).idxer[it];
const int iel = static_cast<int>(bnd_info(b).topo_idx[it]) % 3;
const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1;
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx, bool &lnon_zero) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });

bool mnon_zero = false;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m, bool &llnon_zero) {
llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold);
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(mnon_zero));

lnon_zero = lnon_zero || mnon_zero;
if (bound_type == BoundaryType::flxcor_send) lnon_zero = true;
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(non_zero[iel]));
if (threshold > 0.0) {
Kokkos::parallel_reduce(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx, bool &lnon_zero) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });

bool mnon_zero = false;
Kokkos::parallel_reduce(
Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m, bool &llnon_zero) {
llnon_zero = llnon_zero || (std::abs(buf[m]) >= threshold);
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(mnon_zero));

lnon_zero = lnon_zero || mnon_zero;
if (bound_type == BoundaryType::flxcor_send) lnon_zero = true;
},
Kokkos::LOr<bool, parthenon::DevMemSpace>(non_zero[iel]));
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var = &bnd_info(b).var(iel, t, u, v, k, j, i);
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);

Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { buf[m] = var[m]; });
});
non_zero[iel] = true;
}
idx_offset += idxer.size();
}
Kokkos::single(Kokkos::PerTeam(team_member), [&]() {
Expand Down Expand Up @@ -272,62 +287,121 @@ TaskStatus SetBounds(std::shared_ptr<MeshData<Real>> &md) {
auto &bnd_info = cache.bnd_info;
Kokkos::parallel_for(
PARTHENON_AUTO_LABEL,
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO),
Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO,
pmesh->GetCommVectorLength()),
KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) {
const int b = team_member.league_rank();
if (bnd_info(b).same_to_same) return;
int idx_offset = 0;
for (int it = 0; it < bnd_info(b).ntopological_elements; ++it) {
auto &idxer = bnd_info(b).idxer[it];
auto &lcoord_trans = bnd_info(b).lcoord_trans;
const bool isTrivial = lcoord_trans.IsTrivial();
auto &var = bnd_info(b).var;
const auto [tel, ftemp] =
lcoord_trans.InverseTransform(bnd_info(b).topo_idx[it]);
const bool isCell = (tel == TopologicalElement::CC);
Real fac = ftemp; // Can't capture structured bindings
const int iel = static_cast<int>(tel) % 3;
const int Ni = idxer.template EndIdx<5>() - idxer.template StartIdx<5>() + 1;
if (bnd_info(b).buf_allocated && bnd_info(b).allocated) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure bindings
// being captured
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m];
});
});
if (isTrivial && isCell) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { var_ptr[m] = buf[m]; });
});
} else if (isTrivial) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure
// bindings being captured
const int kk = k;
const int jj = j;
const int ii = i;
Real *var_ptr = &var(iel, t, u, v, kk, jj, ii);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) {
if (idxer.IsActive(kk, jj, ii + m))
var_ptr[m] = buf[m];
});
});
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
Real *buf = &bnd_info(b).buf(idx * Ni + idx_offset);
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
// Have to do this because of some weird issue about structure
// bindings being captured
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = fac * buf[m];
});
});
}
} else if (bnd_info(b).allocated && bound_type != BoundaryType::flxcor_recv) {
const Real default_val = bnd_info(b).var.sparse_default_val;
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = default_val;
});
});
if (isTrivial && isCell) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) { var_ptr[m] = default_val; });
});
} else if (isTrivial) {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int kk = k;
const int jj = j;
const int ii = i;
Real *var_ptr = &var(iel, t, u, v, k, j, i);
Kokkos::parallel_for(Kokkos::ThreadVectorRange<>(team_member, Ni),
[&](int m) {
if (idxer.IsActive(kk, jj, ii + m))
var_ptr[m] = default_val;
});
});
} else {
Kokkos::parallel_for(
Kokkos::TeamThreadRange<>(team_member, idxer.size() / Ni),
[&](const int idx) {
const auto [t, u, v, k, j, i] = idxer(idx * Ni);
const int tt = t;
const int uu = u;
const int vv = v;
const int kk = k;
const int jj = j;
const int ii = i;
Kokkos::parallel_for(
Kokkos::ThreadVectorRange<>(team_member, Ni), [&](int m) {
const auto [il, jl, kl] =
lcoord_trans.InverseTransform({ii + m, jj, kk});
if (idxer.IsActive(kl, jl, il))
var(iel, tt, uu, vv, kl, jl, il) = default_val;
});
});
}
}
idx_offset += idxer.size();
}
Expand Down
7 changes: 7 additions & 0 deletions src/mesh/forest/logical_coordinate_transformation.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ struct LogicalCoordinateTransformation {
std::int64_t origin) const;
CellCentOffsets Transform(CellCentOffsets in) const;

// Check if this transformation includes at most a translation
KOKKOS_INLINE_FUNCTION
bool IsTrivial() const {
return (dir_connection[0] == 0) && (dir_connection[1] == 1) &&
(dir_connection[2] == 2) && !dir_flip[0] && !dir_flip[1] && !dir_flip[2];
}

KOKKOS_INLINE_FUNCTION
std::tuple<TopologicalElement, Real> Transform(TopologicalElement el) const {
int iel = static_cast<int>(el);
Expand Down
2 changes: 2 additions & 0 deletions src/mesh/mesh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages,
default_pack_size_(pin->GetOrAddInteger("parthenon/mesh", "pack_size", -1)),
// private members:
num_mesh_threads_(pin->GetOrAddInteger("parthenon/mesh", "num_threads", 1)),
comm_vector_length_(
pin->GetOrAddInteger("parthenon/mesh", "comm_vector_length", 1)),
use_uniform_meshgen_fn_{true, true, true, true}, lb_flag_(true), lb_automatic_(),
lb_manual_(), nslist(Globals::nranks), nblist(Globals::nranks),
nref(Globals::nranks), nderef(Globals::nranks), rdisp(Globals::nranks),
Expand Down
3 changes: 3 additions & 0 deletions src/mesh/mesh.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ class Mesh {
return nblist[my_rank];
}
int GetNumMeshThreads() const { return num_mesh_threads_; }
int GetCommVectorLength() const { return comm_vector_length_; }

std::int64_t GetTotalCells();
// TODO(JMM): Move block_size into mesh.
int GetNumberOfMeshBlockCells() const;
Expand Down Expand Up @@ -258,6 +260,7 @@ class Mesh {
// data
int root_level, max_level, current_level;
int num_mesh_threads_;
int comm_vector_length_;
/// Maps Global Block IDs to which rank the block is mapped to.
std::vector<int> ranklist;
/// Maps rank to start of local block IDs.
Expand Down

0 comments on commit acafaaf

Please sign in to comment.