From 863ec24290ba52a390709634ad9e335b7d555605 Mon Sep 17 00:00:00 2001 From: jdolence Date: Tue, 31 Oct 2023 10:59:25 -0600 Subject: [PATCH 01/16] wrap profiling calls in a macro/struct --- benchmarks/burgers/burgers_package.cpp | 6 +- example/advection/advection_package.cpp | 3 +- example/particles/particles.cpp | 15 +- example/poisson/poisson_package.cpp | 6 +- .../sparse_advection_package.cpp | 3 +- .../stochastic_subgrid_package.cpp | 8 +- src/amr_criteria/refinement_package.cpp | 6 +- src/bvals/boundary_conditions.cpp | 3 +- src/bvals/bvals_base.cpp | 5 +- src/bvals/comms/boundary_communication.cpp | 17 +- src/bvals/comms/build_boundary_buffers.cpp | 3 +- src/bvals/comms/flux_correction.cpp | 15 +- src/driver/driver.cpp | 79 +- src/driver/multistage.hpp | 6 +- src/interface/meshblock_data.cpp | 3 +- src/interface/swarm_container.cpp | 26 +- src/interface/update.cpp | 4 +- src/interface/update.hpp | 60 +- src/kokkos_abstraction.hpp | 13 +- src/mesh/amr_loadbalance.cpp | 420 +++++----- src/mesh/mesh.cpp | 4 +- src/outputs/outputs.cpp | 3 +- src/outputs/parthenon_hdf5.cpp | 740 +++++++++--------- src/parthenon/prelude.hpp | 1 + 24 files changed, 692 insertions(+), 757 deletions(-) diff --git a/benchmarks/burgers/burgers_package.cpp b/benchmarks/burgers/burgers_package.cpp index 64201440b80b..b66bc69c2430 100644 --- a/benchmarks/burgers/burgers_package.cpp +++ b/benchmarks/burgers/burgers_package.cpp @@ -127,7 +127,7 @@ void CalculateDerived(MeshData *md) { // provide the routine that estimates a stable timestep for this package Real EstimateTimestepMesh(MeshData *md) { - Kokkos::Profiling::pushRegion("Task_burgers_EstimateTimestepMesh"); + PARTHENON_INSTRUMENT auto pm = md->GetParentPointer(); IndexRange ib = md->GetBoundsI(IndexDomain::interior); IndexRange jb = md->GetBoundsJ(IndexDomain::interior); @@ -155,14 +155,13 @@ Real EstimateTimestepMesh(MeshData *md) { }, Kokkos::Min(min_dt)); - Kokkos::Profiling::popRegion(); // Task_burgers_EstimateTimestepMesh return cfl * min_dt; } TaskStatus CalculateFluxes(MeshData *md) { using parthenon::ScratchPad1D; using parthenon::team_mbr_t; - Kokkos::Profiling::pushRegion("Task_burgers_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pm = md->GetParentPointer(); const int ndim = pm->ndim; @@ -360,7 +359,6 @@ TaskStatus CalculateFluxes(MeshData *md) { } }); - Kokkos::Profiling::popRegion(); // Task_burgers_CalculateFluxes return TaskStatus::complete; } diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp index f618de1758c3..52f8d2c69bda 100644 --- a/example/advection/advection_package.cpp +++ b/example/advection/advection_package.cpp @@ -438,7 +438,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { TaskStatus CalculateFluxes(std::shared_ptr> &rc) { using parthenon::MetadataFlag; - Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pmb = rc->GetBlockPointer(); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); @@ -581,7 +581,6 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { }); } - Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes return TaskStatus::complete; } diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp index 5e5648518d3e..5a112977cc23 100644 --- a/example/particles/particles.cpp +++ b/example/particles/particles.cpp @@ -130,7 +130,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // first some helper tasks TaskStatus DestroySomeParticles(MeshBlock *pmb) { - Kokkos::Profiling::pushRegion("Task_Particles_DestroySomeParticles"); + PARTHENON_INSTRUMENT auto pkg = pmb->packages.Get("particles_package"); auto swarm = pmb->swarm_data.Get()->Get("my_particles"); @@ -156,7 +156,6 @@ TaskStatus DestroySomeParticles(MeshBlock *pmb) { // Remove marked particles swarm->RemoveMarkedParticles(); - Kokkos::Profiling::popRegion(); // Task_Particles_DestroySomeParticles return TaskStatus::complete; } @@ -172,7 +171,7 @@ TaskStatus SortParticlesIfUsingPerCellDeposition(MeshBlock *pmb) { } TaskStatus DepositParticles(MeshBlock *pmb) { - Kokkos::Profiling::pushRegion("Task_Particles_DepositParticles"); + PARTHENON_INSTRUMENT auto swarm = pmb->swarm_data.Get()->Get("my_particles"); @@ -238,12 +237,11 @@ TaskStatus DepositParticles(MeshBlock *pmb) { }); } - Kokkos::Profiling::popRegion(); // Task_Particles_DepositParticles return TaskStatus::complete; } TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { - Kokkos::Profiling::pushRegion("Task_Particles_CreateSomeParticles"); + PARTHENON_INSTRUMENT auto pkg = pmb->packages.Get("particles_package"); auto swarm = pmb->swarm_data.Get()->Get("my_particles"); @@ -351,13 +349,12 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { }); } - Kokkos::Profiling::popRegion(); // Task_Particles_CreateSomeParticles return TaskStatus::complete; } TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator, const double t0) { - Kokkos::Profiling::pushRegion("Task_Particles_TransportParticles"); + PARTHENON_INSTRUMENT auto swarm = pmb->swarm_data.Get()->Get("my_particles"); auto pkg = pmb->packages.Get("particles_package"); @@ -478,7 +475,6 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator }); } - Kokkos::Profiling::popRegion(); // Task_Particles_TransportParticles return TaskStatus::complete; } @@ -520,7 +516,7 @@ TaskListStatus ParticleDriver::Step() { // TODO(BRR) This should really be in parthenon/src... but it can't just live in Swarm // because of the loop over blocks TaskStatus StopCommunicationMesh(const BlockList_t &blocks) { - Kokkos::Profiling::pushRegion("Task_Particles_StopCommunicationMesh"); + PARTHENON_INSTRUMENT int num_sent_local = 0; for (auto &block : blocks) { @@ -574,7 +570,6 @@ TaskStatus StopCommunicationMesh(const BlockList_t &blocks) { } } - Kokkos::Profiling::popRegion(); // Task_Particles_StopCommunicationMesh return TaskStatus::complete; } diff --git a/example/poisson/poisson_package.cpp b/example/poisson/poisson_package.cpp index 24d60613981f..378bc48a3176 100644 --- a/example/poisson/poisson_package.cpp +++ b/example/poisson/poisson_package.cpp @@ -207,7 +207,7 @@ TaskStatus SumDeltaPhi(T *du, Real *reduce_sum) { template TaskStatus UpdatePhi(T *u, T *du) { using Stencil_t = parthenon::solvers::Stencil; - Kokkos::Profiling::pushRegion("Task_Poisson_UpdatePhi"); + PARTHENON_INSTRUMENT auto pm = u->GetParentPointer(); IndexRange ib = u->GetBoundsI(IndexDomain::interior); @@ -274,13 +274,12 @@ TaskStatus UpdatePhi(T *u, T *du) { v(b, iphi, k, j, i) += dv(b, idphi, k, j, i); }); - Kokkos::Profiling::popRegion(); // Task_Poisson_UpdatePhi return TaskStatus::complete; } template TaskStatus CheckConvergence(T *u, T *du) { - Kokkos::Profiling::pushRegion("Task_Poisson_UpdatePhi"); + PARTHENON_INSTRUMENT auto pm = u->GetParentPointer(); IndexRange ib = u->GetBoundsI(IndexDomain::interior); @@ -311,7 +310,6 @@ TaskStatus CheckConvergence(T *u, T *du) { auto status = (max_err < err_tol ? TaskStatus::complete : TaskStatus::iterate); - Kokkos::Profiling::popRegion(); // Task_Poisson_CheckConvergence return status; } diff --git a/example/sparse_advection/sparse_advection_package.cpp b/example/sparse_advection/sparse_advection_package.cpp index 01ec9c62475b..b3f8130cbe20 100644 --- a/example/sparse_advection/sparse_advection_package.cpp +++ b/example/sparse_advection/sparse_advection_package.cpp @@ -182,7 +182,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { TaskStatus CalculateFluxes(std::shared_ptr> &rc) { using parthenon::MetadataFlag; - Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pmb = rc->GetBlockPointer(); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); @@ -257,7 +257,6 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { PARTHENON_REQUIRE_THROWS(pmb->pmy_mesh->ndim == 2, "Sparse Advection example must be 2D"); - Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes return TaskStatus::complete; } diff --git a/example/stochastic_subgrid/stochastic_subgrid_package.cpp b/example/stochastic_subgrid/stochastic_subgrid_package.cpp index 6ab1aad2501c..79be429a3232 100644 --- a/example/stochastic_subgrid/stochastic_subgrid_package.cpp +++ b/example/stochastic_subgrid/stochastic_subgrid_package.cpp @@ -245,11 +245,9 @@ AmrTag CheckRefinement(MeshBlockData *rc) { // randomly sample an interation number for each cell from the discrete power-law // distribution TaskStatus ComputeNumIter(std::shared_ptr> &md, Packages_t &packages) { - Kokkos::Profiling::pushRegion("Task_ComputeNumIter"); + PARTHENON_INSTRUMENT - Kokkos::Profiling::pushRegion("Task_ComputeNumIter_pack"); auto pack = md->PackVariables(std::vector({"num_iter"})); - Kokkos::Profiling::popRegion(); auto pkg = packages.Get("stochastic_subgrid_package"); const auto &pool = @@ -276,7 +274,6 @@ TaskStatus ComputeNumIter(std::shared_ptr> &md, Packages_t &packa pack(b, v, k, j, i) = num_iter; }); - Kokkos::Profiling::popRegion(); // Task_ComputeNumIter return TaskStatus::complete; } @@ -364,7 +361,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // some field "advected" that we are pushing around. // This routine implements all the "physics" in this example TaskStatus CalculateFluxes(std::shared_ptr> &rc) { - Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pmb = rc->GetBlockPointer(); const IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); const IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior); @@ -467,7 +464,6 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { }); } - Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes return TaskStatus::complete; } diff --git a/src/amr_criteria/refinement_package.cpp b/src/amr_criteria/refinement_package.cpp index 285d01c71c21..bc6744cab109 100644 --- a/src/amr_criteria/refinement_package.cpp +++ b/src/amr_criteria/refinement_package.cpp @@ -153,19 +153,17 @@ void SetRefinement_(MeshBlockData *rc) { template <> TaskStatus Tag(MeshBlockData *rc) { - Kokkos::Profiling::pushRegion("Task_Tag_Block"); + PARTHENON_INSTRUMENT SetRefinement_(rc); - Kokkos::Profiling::popRegion(); // Task_Tag_Block return TaskStatus::complete; } template <> TaskStatus Tag(MeshData *rc) { - Kokkos::Profiling::pushRegion("Task_Tag_Mesh"); + PARTHENON_INSTRUMENT for (int i = 0; i < rc->NumBlocks(); i++) { SetRefinement_(rc->GetBlockData(i).get()); } - Kokkos::Profiling::popRegion(); // Task_Tag_Mesh return TaskStatus::complete; } diff --git a/src/bvals/boundary_conditions.cpp b/src/bvals/boundary_conditions.cpp index bac842d8614b..f453f256bedd 100644 --- a/src/bvals/boundary_conditions.cpp +++ b/src/bvals/boundary_conditions.cpp @@ -33,7 +33,7 @@ bool DoPhysicalBoundary_(const BoundaryFlag flag, const BoundaryFace face, TaskStatus ApplyBoundaryConditionsOnCoarseOrFine(std::shared_ptr> &rc, bool coarse) { - Kokkos::Profiling::pushRegion("Task_ApplyBoundaryConditionsOnCoarseOrFine"); + PARTHENON_INSTRUMENT using namespace boundary_cond_impl; std::shared_ptr pmb = rc->GetBlockPointer(); Mesh *pmesh = pmb->pmy_mesh; @@ -47,7 +47,6 @@ TaskStatus ApplyBoundaryConditionsOnCoarseOrFine(std::shared_ptr &newly_refined) { - Kokkos::Profiling::pushRegion("SearchAndSetNeighbors"); + PARTHENON_INSTRUMENT MeshBlockTree *neibt; int myox1, myox2 = 0, myox3 = 0, myfx1, myfx2, myfx3; myfx1 = ((loc.lx1() & 1LL) == 1LL); @@ -371,7 +371,6 @@ void BoundaryBase::SearchAndSetNeighbors( } if (block_size_.nx(X2DIR) == 1) { SetNeighborOwnership(newly_refined); - Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors return; } @@ -506,7 +505,6 @@ void BoundaryBase::SearchAndSetNeighbors( if (block_size_.nx(X3DIR) == 1) { SetNeighborOwnership(newly_refined); - Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors return; } @@ -629,7 +627,6 @@ void BoundaryBase::SearchAndSetNeighbors( } SetNeighborOwnership(newly_refined); - Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors } void BoundaryBase::SetNeighborOwnership( diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index fc5231c2da79..4f26a9f52809 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -46,7 +46,7 @@ using namespace loops::shorthands; template TaskStatus SendBoundBufs(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_LoadAndSendBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true); @@ -59,11 +59,9 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { CheckSendBufferCacheForRebuild(md); if (nbound == 0) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs return TaskStatus::complete; } if (other_communication_unfinished) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs return TaskStatus::incomplete; } @@ -144,7 +142,6 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { buf.SendNull(); } - Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs return TaskStatus::complete; } @@ -155,7 +152,7 @@ SendBoundBufs(std::shared_ptr> &); template TaskStatus StartReceiveBoundBufs(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_StartReceiveBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); if (cache.buf_vec.size() == 0) @@ -165,7 +162,6 @@ TaskStatus StartReceiveBoundBufs(std::shared_ptr> &md) { std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->TryStartReceive(); }); - Kokkos::Profiling::popRegion(); // Task_StartReceiveBoundBufs return TaskStatus::complete; } @@ -178,7 +174,7 @@ StartReceiveBoundBufs(std::shared_ptr> &) template TaskStatus ReceiveBoundBufs(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_ReceiveBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -209,7 +205,6 @@ TaskStatus ReceiveBoundBufs(std::shared_ptr> &md) { ++ibound; }); } - Kokkos::Profiling::popRegion(); // Task_ReceiveBoundBufs if (all_received) return TaskStatus::complete; return TaskStatus::incomplete; } @@ -223,7 +218,7 @@ ReceiveBoundBufs(std::shared_ptr> &); template TaskStatus SetBounds(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_SetInternalBoundaries"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -294,7 +289,6 @@ TaskStatus SetBounds(std::shared_ptr> &md) { refinement::Restrict(resolved_packages, cache.prores_cache, pmb->cellbounds, pmb->c_cellbounds); } - Kokkos::Profiling::popRegion(); // Task_SetInternalBoundaries return TaskStatus::complete; } @@ -304,7 +298,7 @@ template TaskStatus SetBounds(std::shared_ptr TaskStatus ProlongateBounds(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_ProlongateBoundaries"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -323,7 +317,6 @@ TaskStatus ProlongateBounds(std::shared_ptr> &md) { refinement::ProlongateInternal(resolved_packages, cache.prores_cache, pmb->cellbounds, pmb->c_cellbounds); } - Kokkos::Profiling::popRegion(); // Task_ProlongateBoundaries return TaskStatus::complete; } diff --git a/src/bvals/comms/build_boundary_buffers.cpp b/src/bvals/comms/build_boundary_buffers.cpp index 01868f22ec9b..3eabe3c4d284 100644 --- a/src/bvals/comms/build_boundary_buffers.cpp +++ b/src/bvals/comms/build_boundary_buffers.cpp @@ -113,7 +113,7 @@ void BuildBoundaryBufferSubset(std::shared_ptr> &md, // pmesh->boundary_comm_map.clear() after every remesh // in InitializeBlockTimeStepsAndBoundaries() TaskStatus BuildBoundaryBuffers(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_BuildSendBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &all_caches = md->GetBvarsCache(); @@ -127,7 +127,6 @@ TaskStatus BuildBoundaryBuffers(std::shared_ptr> &md) { BuildBoundaryBufferSubset(md, pmesh->boundary_comm_flxcor_map); - Kokkos::Profiling::popRegion(); // "Task_BuildSendBoundBufs" return TaskStatus::complete; } } // namespace parthenon diff --git a/src/bvals/comms/flux_correction.cpp b/src/bvals/comms/flux_correction.cpp index 5a82a15429e6..50166ec9ca89 100644 --- a/src/bvals/comms/flux_correction.cpp +++ b/src/bvals/comms/flux_correction.cpp @@ -39,7 +39,7 @@ namespace parthenon { using namespace impl; TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_LoadAndSendFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_send, true); @@ -53,12 +53,10 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { CheckSendBufferCacheForRebuild(md); if (nbound == 0) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections return TaskStatus::complete; } if (other_communication_unfinished) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections return TaskStatus::incomplete; } @@ -116,12 +114,11 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { // Calling Send will send null if the underlying buffer is unallocated for (auto &buf : cache.buf_vec) buf->Send(); - Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections return TaskStatus::complete; } TaskStatus StartReceiveFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_StartReceiveFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); if (cache.buf_vec.size() == 0) @@ -131,12 +128,11 @@ TaskStatus StartReceiveFluxCorrections(std::shared_ptr> &md) { std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->TryStartReceive(); }); - Kokkos::Profiling::popRegion(); // Task_StartReceiveFluxCorrections return TaskStatus::complete; } TaskStatus ReceiveFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_ReceiveFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); @@ -149,14 +145,12 @@ TaskStatus ReceiveFluxCorrections(std::shared_ptr> &md) { std::begin(cache.buf_vec), std::end(cache.buf_vec), [&all_received](auto pbuf) { all_received = pbuf->TryReceive() && all_received; }); - Kokkos::Profiling::popRegion(); // Task_ReceiveFluxCorrections - if (all_received) return TaskStatus::complete; return TaskStatus::incomplete; } TaskStatus SetFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_SetFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); @@ -188,7 +182,6 @@ TaskStatus SetFluxCorrections(std::shared_ptr> &md) { std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->Stale(); }); - Kokkos::Profiling::popRegion(); // Task_SetFluxCorrections return TaskStatus::complete; } diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 84371eeaeb36..8359e6fc9a82 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -75,51 +75,54 @@ DriverStatus EvolutionDriver::Execute() { // Defaults must be set across all ranks DumpInputParameters(); - Kokkos::Profiling::pushRegion("Driver_Main"); - while (tm.KeepGoing()) { - if (Globals::my_rank == 0) OutputCycleDiagnostics(); - - pmesh->PreStepUserWorkInLoop(pmesh, pinput, tm); - pmesh->PreStepUserDiagnosticsInLoop(pmesh, pinput, tm); - - TaskListStatus status = Step(); - if (status != TaskListStatus::complete) { - std::cerr << "Step failed to complete all tasks." << std::endl; - return DriverStatus::failed; - } + { // Main t < tmax loop region + PARTHENON_INSTRUMENT_REGION("evolution_loop") + while (tm.KeepGoing()) { + if (Globals::my_rank == 0) OutputCycleDiagnostics(); + + pmesh->PreStepUserWorkInLoop(pmesh, pinput, tm); + pmesh->PreStepUserDiagnosticsInLoop(pmesh, pinput, tm); + + TaskListStatus status = Step(); + if (status != TaskListStatus::complete) { + std::cerr << "Step failed to complete all tasks." << std::endl; + return DriverStatus::failed; + } - pmesh->PostStepUserWorkInLoop(pmesh, pinput, tm); - pmesh->PostStepUserDiagnosticsInLoop(pmesh, pinput, tm); + pmesh->PostStepUserWorkInLoop(pmesh, pinput, tm); + pmesh->PostStepUserDiagnosticsInLoop(pmesh, pinput, tm); - tm.ncycle++; - tm.time += tm.dt; - pmesh->mbcnt += pmesh->nbtotal; - pmesh->step_since_lb++; + tm.ncycle++; + tm.time += tm.dt; + pmesh->mbcnt += pmesh->nbtotal; + pmesh->step_since_lb++; - timer_LBandAMR.reset(); - pmesh->LoadBalancingAndAdaptiveMeshRefinement(pinput, app_input); - if (pmesh->modified) InitializeBlockTimeStepsAndBoundaries(); - time_LBandAMR += timer_LBandAMR.seconds(); - SetGlobalTimeStep(); + timer_LBandAMR.reset(); + pmesh->LoadBalancingAndAdaptiveMeshRefinement(pinput, app_input); + if (pmesh->modified) InitializeBlockTimeStepsAndBoundaries(); + time_LBandAMR += timer_LBandAMR.seconds(); + SetGlobalTimeStep(); - // check for signals - signal = SignalHandler::CheckSignalFlags(); + // check for signals + signal = SignalHandler::CheckSignalFlags(); - if (signal == OutputSignal::final) { - break; - } + if (signal == OutputSignal::final) { + break; + } - // skip the final (last) output at the end of the simulation time as it happens later - if (tm.KeepGoing()) { - pouts->MakeOutputs(pmesh, pinput, &tm, signal); - } + // skip the final (last) output at the end of the simulation time as it happens + // later + if (tm.KeepGoing()) { + pouts->MakeOutputs(pmesh, pinput, &tm, signal); + } - if (tm.ncycle == perf_cycle_offset) { - pmesh->mbcnt = 0; - timer_main.reset(); - } - } // END OF MAIN INTEGRATION LOOP ====================================================== - Kokkos::Profiling::popRegion(); // Driver_Main + if (tm.ncycle == perf_cycle_offset) { + pmesh->mbcnt = 0; + timer_main.reset(); + } + } // END OF MAIN INTEGRATION LOOP + // ====================================================== + } // Main t < tmax loop region pmesh->UserWorkAfterLoop(pmesh, pinput, tm); diff --git a/src/driver/multistage.hpp b/src/driver/multistage.hpp index 07748ae3564c..9cc1fcf5b2c5 100644 --- a/src/driver/multistage.hpp +++ b/src/driver/multistage.hpp @@ -37,7 +37,7 @@ class MultiStageDriverGeneric : public EvolutionDriver { // the dependencies that must be executed. virtual TaskCollection MakeTaskCollection(BlockList_t &blocks, int stage) = 0; virtual TaskListStatus Step() { - Kokkos::Profiling::pushRegion("MultiStage_Step"); + PARTHENON_INSTRUMENT using DriverUtils::ConstructAndExecuteTaskLists; TaskListStatus status; integrator->dt = tm.dt; @@ -49,7 +49,6 @@ class MultiStageDriverGeneric : public EvolutionDriver { status = ConstructAndExecuteTaskLists<>(this, stage); if (status != TaskListStatus::complete) break; } - Kokkos::Profiling::popRegion(); // MultiStage_Step return status; } @@ -66,7 +65,7 @@ class MultiStageBlockTaskDriverGeneric : public MultiStageDriverGeneric(pin, app_in, pm) {} virtual TaskList MakeTaskList(MeshBlock *pmb, int stage) = 0; virtual TaskListStatus Step() { - Kokkos::Profiling::pushRegion("MultiStageBlockTask_Step"); + PARTHENON_INSTRUMENT using DriverUtils::ConstructAndExecuteBlockTasks; TaskListStatus status; Integrator *integrator = (this->integrator).get(); @@ -76,7 +75,6 @@ class MultiStageBlockTaskDriverGeneric : public MultiStageDriverGeneric(this, stage); if (status != TaskListStatus::complete) break; } - Kokkos::Profiling::popRegion(); // MultiStageBlockTask_Step return status; } diff --git a/src/interface/meshblock_data.cpp b/src/interface/meshblock_data.cpp index 1b194597911b..5f685c0e119d 100644 --- a/src/interface/meshblock_data.cpp +++ b/src/interface/meshblock_data.cpp @@ -327,7 +327,7 @@ template typename MeshBlockData::VarList MeshBlockData::GetVariablesByFlag(const Metadata::FlagCollection &flags, const std::vector &sparse_ids) { - Kokkos::Profiling::pushRegion("GetVariablesByFlag"); + PARTHENON_INSTRUMENT typename MeshBlockData::VarList var_list; std::unordered_set sparse_ids_set(sparse_ids.begin(), sparse_ids.end()); @@ -338,7 +338,6 @@ MeshBlockData::GetVariablesByFlag(const Metadata::FlagCollection &flags, var_list.Add(v, sparse_ids_set); } - Kokkos::Profiling::popRegion(); // GetVariablesByFlag return var_list; } diff --git a/src/interface/swarm_container.cpp b/src/interface/swarm_container.cpp index 6abb9de69721..056d742bcea7 100644 --- a/src/interface/swarm_container.cpp +++ b/src/interface/swarm_container.cpp @@ -83,16 +83,15 @@ void SwarmContainer::Remove(const std::string &label) { // Return swarms meeting some conditions SwarmSet SwarmContainer::GetSwarmsByFlag(const Metadata::FlagCollection &flags) { - Kokkos::Profiling::pushRegion("GetSwarmsByFlag"); + PARTHENON_INSTRUMENT auto swarms = MetadataUtils::GetByFlag(flags, swarmMap_, swarmMetadataMap_); - Kokkos::Profiling::popRegion(); // GetSwarmsByFlag return swarms; } TaskStatus SwarmContainer::Defrag(double min_occupancy) { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Defrag"); + PARTHENON_INSTRUMENT PARTHENON_REQUIRE_THROWS(min_occupancy >= 0. && min_occupancy <= 1., "Max fractional occupancy of swarm must be >= 0 and <= 1"); @@ -103,29 +102,24 @@ TaskStatus SwarmContainer::Defrag(double min_occupancy) { } } - Kokkos::Profiling::popRegion(); - return TaskStatus::complete; } TaskStatus SwarmContainer::DefragAll() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Defrag"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->Defrag(); } - Kokkos::Profiling::popRegion(); return TaskStatus::complete; } TaskStatus SwarmContainer::SortParticlesByCell() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_SortParticlesByCell"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->SortParticlesByCell(); } - Kokkos::Profiling::popRegion(); - return TaskStatus::complete; } @@ -150,18 +144,17 @@ void SwarmContainer::AllocateBoundaries() { } TaskStatus SwarmContainer::Send(BoundaryCommSubset phase) { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Send"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->Send(phase); } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_Send return TaskStatus::complete; } TaskStatus SwarmContainer::Receive(BoundaryCommSubset phase) { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Receive"); + PARTHENON_INSTRUMENT int success = 0, total = 0; for (auto &s : swarmVector_) { @@ -171,24 +164,22 @@ TaskStatus SwarmContainer::Receive(BoundaryCommSubset phase) { total++; } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_Receive if (success == total) return TaskStatus::complete; return TaskStatus::incomplete; } TaskStatus SwarmContainer::ResetCommunication() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_ResetCommunication"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->ResetCommunication(); } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_ResetCommunication return TaskStatus::complete; } TaskStatus SwarmContainer::FinalizeCommunicationIterative() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_FinalizeCommunicationIterative"); + PARTHENON_INSTRUMENT PARTHENON_THROW("FinalizeCommunicationIterative not yet fully implemented!") @@ -200,7 +191,6 @@ TaskStatus SwarmContainer::FinalizeCommunicationIterative() { total++; } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_FinalizeCommunicationIterative if (success == total) return TaskStatus::complete; return TaskStatus::incomplete; } diff --git a/src/interface/update.cpp b/src/interface/update.cpp index e4da9849f8d2..02c3b632926e 100644 --- a/src/interface/update.cpp +++ b/src/interface/update.cpp @@ -140,12 +140,11 @@ TaskStatus UpdateWithFluxDivergence(MeshData *u0_data, MeshData *u1_ } TaskStatus SparseDealloc(MeshData *md) { + PARTHENON_INSTRUMENT if (!Globals::sparse_config.enabled || (md->NumBlocks() == 0)) { return TaskStatus::complete; } - Kokkos::Profiling::pushRegion("Task_SparseDealloc"); - const IndexRange ib = md->GetBoundsI(IndexDomain::entire); const IndexRange jb = md->GetBoundsJ(IndexDomain::entire); const IndexRange kb = md->GetBoundsK(IndexDomain::entire); @@ -219,7 +218,6 @@ TaskStatus SparseDealloc(MeshData *md) { } } - Kokkos::Profiling::popRegion(); // Task_SparseDealloc return TaskStatus::complete; } diff --git a/src/interface/update.hpp b/src/interface/update.hpp index eb0dbc57d3e6..d2cc37343802 100644 --- a/src/interface/update.hpp +++ b/src/interface/update.hpp @@ -70,7 +70,7 @@ TaskStatus UpdateWithFluxDivergence(T *data_u0, T *data_u1, const Real gam0, template TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const Real w2, T *out) { - Kokkos::Profiling::pushRegion("Task_WeightedSumData"); + PARTHENON_INSTRUMENT const auto &x = in1->PackVariables(flags); const auto &y = in2->PackVariables(flags); const auto &z = out->PackVariables(flags); @@ -85,7 +85,6 @@ TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const z(b, l, k, j, i) = w1 * x(b, l, k, j, i) + w2 * y(b, l, k, j, i); } }); - Kokkos::Profiling::popRegion(); // Task_WeightedSumData return TaskStatus::complete; } @@ -96,7 +95,7 @@ TaskStatus CopyData(const F &flags, T *in, T *out) { template TaskStatus SetDataToConstant(const F &flags, T *data, const Real val) { - Kokkos::Profiling::pushRegion("Task_SetDataToConstant"); + PARTHENON_INSTRUMENT const auto &x = data->PackVariables(flags); parthenon::par_for( DEFAULT_LOOP_PATTERN, "SetDataToConstant", DevExecSpace(), 0, x.GetDim(5) - 1, 0, @@ -106,7 +105,6 @@ TaskStatus SetDataToConstant(const F &flags, T *data, const Real val) { x(b, l, k, j, i) = val; } }); - Kokkos::Profiling::popRegion(); // Task_SetDataToConstant return TaskStatus::complete; } @@ -148,7 +146,7 @@ template TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data, const LowStorageIntegrator *pint, Real dt, int stage, bool update_s1) { - Kokkos::Profiling::pushRegion("Task_2S_Update"); + PARTHENON_INSTRUMENT const auto &s0 = s0_data->PackVariables(flags); const auto &s1 = s1_data->PackVariables(flags); const auto &rhs = rhs_data->PackVariables(flags); @@ -174,7 +172,6 @@ TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data, beta * dt * rhs(b, l, k, j, i); } }); - Kokkos::Profiling::popRegion(); // Task_2S_Update return TaskStatus::complete; } template @@ -194,7 +191,7 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr base_data, std::vector> stage_data, std::shared_ptr out_data, const ButcherIntegrator *pint, Real dt, int stage) { - Kokkos::Profiling::pushRegion("Task_Butcher_Sum"); + PARTHENON_INSTRUMENT const auto &out = out_data->PackVariables(flags); const auto &in = base_data->PackVariables(flags); const IndexDomain interior = IndexDomain::interior; @@ -221,7 +218,6 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr base_data, } }); } - Kokkos::Profiling::popRegion(); // Task_Butcher_Sum return TaskStatus::complete; } template @@ -238,7 +234,7 @@ template TaskStatus UpdateButcher(const F &flags, std::vector> stage_data, std::shared_ptr out_data, const ButcherIntegrator *pint, Real dt) { - Kokkos::Profiling::pushRegion("Task_Butcher_Update"); + PARTHENON_INSTRUMENT const auto &out = out_data->PackVariables(flags); const IndexDomain interior = IndexDomain::interior; @@ -259,7 +255,6 @@ TaskStatus UpdateButcher(const F &flags, std::vector> stage_d } }); } - Kokkos::Profiling::popRegion(); // Task_Butcher_Update return TaskStatus::complete; } template @@ -272,53 +267,54 @@ TaskStatus UpdateButcherIndependent(std::vector> stage_data, template TaskStatus EstimateTimestep(T *rc) { - Kokkos::Profiling::pushRegion("Task_EstimateTimestep"); + PARTHENON_INSTRUMENT Real dt_min = std::numeric_limits::max(); for (const auto &pkg : rc->GetParentPointer()->packages.AllPackages()) { Real dt = pkg.second->EstimateTimestep(rc); dt_min = std::min(dt_min, dt); } rc->SetAllowedDt(dt_min); - Kokkos::Profiling::popRegion(); // Task_EstimateTimestep return TaskStatus::complete; } template TaskStatus PreCommFillDerived(T *rc) { - Kokkos::Profiling::pushRegion("Task_PreCommFillDerived"); + PARTHENON_INSTRUMENT auto pm = rc->GetParentPointer(); for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->PreCommFillDerived(rc); } - Kokkos::Profiling::popRegion(); return TaskStatus::complete; } template TaskStatus FillDerived(T *rc) { - Kokkos::Profiling::pushRegion("Task_FillDerived"); + PARTHENON_INSTRUMENT auto pm = rc->GetParentPointer(); - Kokkos::Profiling::pushRegion("PreFillDerived"); - for (const auto &pkg : pm->packages.AllPackages()) { - pkg.second->PreFillDerived(rc); - } - Kokkos::Profiling::popRegion(); // PreFillDerived - Kokkos::Profiling::pushRegion("FillDerived"); - for (const auto &pkg : pm->packages.AllPackages()) { - pkg.second->FillDerived(rc); - } - Kokkos::Profiling::popRegion(); // FillDerived - Kokkos::Profiling::pushRegion("PostFillDerived"); - for (const auto &pkg : pm->packages.AllPackages()) { - pkg.second->PostFillDerived(rc); - } - Kokkos::Profiling::popRegion(); // PostFillDerived - Kokkos::Profiling::popRegion(); // Task_FillDerived + { // PreFillDerived region + PARTHENON_INSTRUMENT_REGION("PreFillDerived") + for (const auto &pkg : pm->packages.AllPackages()) { + pkg.second->PreFillDerived(rc); + } + } // PreFillDerived region + { // FillDerived region + PARTHENON_INSTRUMENT_REGION("FillDerived"); + for (const auto &pkg : pm->packages.AllPackages()) { + pkg.second->FillDerived(rc); + } + } // FillDerived region + { // PostFillDerived region + PARTHENON_INSTRUMENT_REGION("PostFillDerived"); + for (const auto &pkg : pm->packages.AllPackages()) { + pkg.second->PostFillDerived(rc); + } + } // PostFillDerived region return TaskStatus::complete; } template TaskStatus InitNewlyAllocatedVars(T *rc) { + PARTHENON_INSTRUMENT if (!rc->AllVariablesInitialized()) { const IndexDomain interior = IndexDomain::interior; const IndexRange ib = rc->GetBoundsI(interior); @@ -369,12 +365,10 @@ TaskStatus InitNewlyAllocatedVars(T *rc) { // This has to be done even in the case where no blocks have been allocated // since the boundaries of allocated blocks could have received default data // in any case - Kokkos::Profiling::pushRegion("Task_InitNewlyAllocatedVars"); auto pm = rc->GetParentPointer(); for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->InitNewlyAllocatedVars(rc); } - Kokkos::Profiling::popRegion(); // Don't worry about flagging variables as initialized // since they will be flagged at the beginning of the diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index bfaf6a247a66..7a2220fd421f 100644 --- a/src/kokkos_abstraction.hpp +++ b/src/kokkos_abstraction.hpp @@ -29,6 +29,7 @@ #include "parthenon_array_generic.hpp" #include "utils/error_checking.hpp" +#include "utils/instrument.hpp" #include "utils/object_pool.hpp" namespace parthenon { @@ -334,13 +335,12 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, DevExecSpace exec_space, const int &kl, const int &ku, const int &jl, const int &ju, const int &il, const int &iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto k = kl; k <= ku; k++) for (auto j = jl; j <= ju; j++) #pragma omp simd for (auto i = il; i <= iu; i++) function(k, j, i); - Kokkos::Profiling::popRegion(); } // 4D loop using Kokkos 1D Range @@ -468,14 +468,13 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, DevExecSpace exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto n = nl; n <= nu; n++) for (auto k = kl; k <= ku; k++) for (auto j = jl; j <= ju; j++) #pragma omp simd for (auto i = il; i <= iu; i++) function(n, k, j, i); - Kokkos::Profiling::popRegion(); } // 5D loop using MDRange loops @@ -536,7 +535,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto b = bl; b <= bu; b++) for (auto n = nl; n <= nu; n++) for (auto k = kl; k <= ku; k++) @@ -544,7 +543,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, #pragma omp simd for (auto i = il; i <= iu; i++) function(b, n, k, j, i); - Kokkos::Profiling::popRegion(); } // 6D loop using MDRange loops @@ -609,7 +607,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, const int ml, const int mu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto l = ll; l <= lu; l++) for (auto m = ml; m <= mu; m++) for (auto n = nl; n <= nu; n++) @@ -618,7 +616,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, #pragma omp simd for (auto i = il; i <= iu; i++) function(l, m, n, k, j, i); - Kokkos::Profiling::popRegion(); } template diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp index 32ee6d398d37..4018d8220057 100644 --- a/src/mesh/amr_loadbalance.cpp +++ b/src/mesh/amr_loadbalance.cpp @@ -317,7 +317,7 @@ bool TryRecvSameToSame(int lid_recv, int send_rank, Variable *var, MeshBlo void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin, ApplicationInput *app_in) { - Kokkos::Profiling::pushRegion("LoadBalancingAndAdaptiveMeshRefinement"); + PARTHENON_INSTRUMENT int nnew = 0, ndel = 0; if (adaptive) { @@ -342,7 +342,6 @@ void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin, } lb_flag_ = false; } - Kokkos::Profiling::popRegion(); // LoadBalancingAndAdaptiveMeshRefinement } // Private routines @@ -405,7 +404,7 @@ void UpdateBlockList(std::vector const &ranklist, std::vector &nslist, void Mesh::CalculateLoadBalance(std::vector const &costlist, std::vector &ranklist, std::vector &nslist, std::vector &nblist) { - Kokkos::Profiling::pushRegion("CalculateLoadBalance"); + PARTHENON_INSTRUMENT auto const total_blocks = costlist.size(); using it = std::vector::const_iterator; @@ -452,7 +451,6 @@ void Mesh::CalculateLoadBalance(std::vector const &costlist, << std::endl; } } - Kokkos::Profiling::popRegion(); // CalculateLoadBalance } //---------------------------------------------------------------------------------------- @@ -492,7 +490,7 @@ void Mesh::UpdateCostList() { // \brief collect refinement flags and manipulate the MeshBlockTree void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) { - Kokkos::Profiling::pushRegion("UpdateMeshBlockTree"); + PARTHENON_INSTRUMENT // compute nleaf= number of leaf MeshBlocks per refined block int nleaf = 2; if (!mesh_size.symmetry(X2DIR)) nleaf = 4; @@ -520,7 +518,6 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) { tnderef += nderef[n]; } if (tnref == 0 && tnderef < nleaf) { // nothing to do - Kokkos::Profiling::popRegion(); // UpdateMeshBlockTree return; } @@ -622,8 +619,6 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) { bt->Derefine(ndel); } if (tnderef >= nleaf) delete[] clderef; - - Kokkos::Profiling::popRegion(); // UpdateMeshBlockTree } //---------------------------------------------------------------------------------------- @@ -664,7 +659,7 @@ bool Mesh::GatherCostListAndCheckBalance() { void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in, int ntot) { - Kokkos::Profiling::pushRegion("RedistributeAndRefineMeshBlocks"); + PARTHENON_INSTRUMENT // kill any cached packs mesh_data.PurgeNonBase(); mesh_data.Get()->ClearCaches(); @@ -675,58 +670,57 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput if (!mesh_size.symmetry(X3DIR)) nleaf = 8; // construct new lists - Kokkos::Profiling::pushRegion("Construct new list"); std::vector newloc(ntot); std::vector newrank(ntot); std::vector newcost(ntot); std::vector newtoold(ntot); std::vector oldtonew(nbtotal); - int nbtold = nbtotal; - tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal); - - // create a list mapping the previous gid to the current one - oldtonew[0] = 0; - int mb_idx = 1; - for (int n = 1; n < ntot; n++) { - if (newtoold[n] == newtoold[n - 1] + 1) { // normal - oldtonew[mb_idx++] = n; - } else if (newtoold[n] == newtoold[n - 1] + nleaf) { // derefined - for (int j = 0; j < nleaf - 1; j++) - oldtonew[mb_idx++] = n - 1; - oldtonew[mb_idx++] = n; - } - } - // fill the last block - for (; mb_idx < nbtold; mb_idx++) - oldtonew[mb_idx] = ntot - 1; - - current_level = 0; std::unordered_set newly_refined; - for (int n = 0; n < ntot; n++) { - // "on" = "old n" = "old gid" = "old global MeshBlock ID" - int on = newtoold[n]; - if (newloc[n].level() > current_level) // set the current max level - current_level = newloc[n].level(); - if (newloc[n].level() >= loclist[on].level()) { // same or refined - newcost[n] = costlist[on]; - // Keep a list of all blocks refined for below - if (newloc[n].level() > loclist[on].level()) { - newly_refined.insert(newloc[n]); - } - } else { - double acost = 0.0; - for (int l = 0; l < nleaf; l++) - acost += costlist[on + l]; - newcost[n] = acost / nleaf; - } - } - // store old nbstart and nbend before load balancing. int onbs = nslist[Globals::my_rank]; int onbe = onbs + nblist[Globals::my_rank] - 1; - Kokkos::Profiling::popRegion(); // Construct new list + { // Construct new list region + PARTHENON_INSTRUMENT_REGION("Construct_new_list") + tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal); + + // create a list mapping the previous gid to the current one + oldtonew[0] = 0; + int mb_idx = 1; + for (int n = 1; n < ntot; n++) { + if (newtoold[n] == newtoold[n - 1] + 1) { // normal + oldtonew[mb_idx++] = n; + } else if (newtoold[n] == newtoold[n - 1] + nleaf) { // derefined + for (int j = 0; j < nleaf - 1; j++) + oldtonew[mb_idx++] = n - 1; + oldtonew[mb_idx++] = n; + } + } + // fill the last block + for (; mb_idx < nbtold; mb_idx++) + oldtonew[mb_idx] = ntot - 1; + + current_level = 0; + for (int n = 0; n < ntot; n++) { + // "on" = "old n" = "old gid" = "old global MeshBlock ID" + int on = newtoold[n]; + if (newloc[n].level() > current_level) // set the current max level + current_level = newloc[n].level(); + if (newloc[n].level() >= loclist[on].level()) { // same or refined + newcost[n] = costlist[on]; + // Keep a list of all blocks refined for below + if (newloc[n].level() > loclist[on].level()) { + newly_refined.insert(newloc[n]); + } + } else { + double acost = 0.0; + for (int l = 0; l < nleaf; l++) + acost += costlist[on + l]; + newcost[n] = acost / nleaf; + } + } + } // Construct new list region // Calculate new load balance CalculateLoadBalance(newcost, newrank, nslist, nblist); @@ -763,64 +757,67 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput #ifdef MPI_PARALLEL // Send data from old to new blocks - Kokkos::Profiling::pushRegion("AMR: Send"); std::vector send_reqs; - for (int n = onbs; n <= onbe; n++) { - int nn = oldtonew[n]; - LogicalLocation &oloc = loclist[n]; - LogicalLocation &nloc = newloc[nn]; - auto pb = FindMeshBlock(n); - if (nloc.level() == oloc.level() && - newrank[nn] != Globals::my_rank) { // same level, different rank - for (auto &var : pb->vars_cc_) - send_reqs.emplace_back(SendSameToSame(nn - nslist[newrank[nn]], newrank[nn], - var.get(), pb.get(), this)); - } else if (nloc.level() > oloc.level()) { // c2f - // c2f must communicate to multiple leaf blocks (unlike f2c, same2same) - for (int l = 0; l < nleaf; l++) { - const int nl = nn + l; // Leaf block index in new global block list - LogicalLocation &nloc = newloc[nl]; + { // AMR Send region + PARTHENON_INSTRUMENT_REGION("AMR_Send") + for (int n = onbs; n <= onbe; n++) { + int nn = oldtonew[n]; + LogicalLocation &oloc = loclist[n]; + LogicalLocation &nloc = newloc[nn]; + auto pb = FindMeshBlock(n); + if (nloc.level() == oloc.level() && + newrank[nn] != Globals::my_rank) { // same level, different rank for (auto &var : pb->vars_cc_) - send_reqs.emplace_back(SendCoarseToFine(nl - nslist[newrank[nl]], newrank[nl], - nloc, var.get(), this)); - } // end loop over nleaf (unique to c2f branch in this step 6) - } else if (nloc.level() < oloc.level()) { // f2c: restrict + pack + send - for (auto &var : pb->vars_cc_) - send_reqs.emplace_back(SendFineToCoarse(nn - nslist[newrank[nn]], newrank[nn], - oloc, var.get(), this)); + send_reqs.emplace_back(SendSameToSame(nn - nslist[newrank[nn]], newrank[nn], + var.get(), pb.get(), this)); + } else if (nloc.level() > oloc.level()) { // c2f + // c2f must communicate to multiple leaf blocks (unlike f2c, same2same) + for (int l = 0; l < nleaf; l++) { + const int nl = nn + l; // Leaf block index in new global block list + LogicalLocation &nloc = newloc[nl]; + for (auto &var : pb->vars_cc_) + send_reqs.emplace_back(SendCoarseToFine(nl - nslist[newrank[nl]], newrank[nl], + nloc, var.get(), this)); + } // end loop over nleaf (unique to c2f branch in this step 6) + } else if (nloc.level() < oloc.level()) { // f2c: restrict + pack + send + for (auto &var : pb->vars_cc_) + send_reqs.emplace_back(SendFineToCoarse(nn - nslist[newrank[nn]], newrank[nn], + oloc, var.get(), this)); + } } - } - Kokkos::Profiling::popRegion(); // AMR: Send -#endif // MPI_PARALLEL + } // AMR Send region +#endif // MPI_PARALLEL // Construct a new MeshBlock list (moving the data within the MPI rank) - Kokkos::Profiling::pushRegion("AMR: Construct new MeshBlockList"); - RegionSize block_size = GetBlockSize(); - BlockList_t new_block_list(nbe - nbs + 1); - for (int n = nbs; n <= nbe; n++) { - int on = newtoold[n]; - if ((ranklist[on] == Globals::my_rank) && - (loclist[on].level() == newloc[n].level())) { - // on the same MPI rank and same level -> just move it - new_block_list[n - nbs] = FindMeshBlock(on); - if (!new_block_list[n - nbs]) { + { // AMR Construct new MeshBlockList region + PARTHENON_INSTRUMENT_REGION("AMR_Construct_new_list") + RegionSize block_size = GetBlockSize(); + + for (int n = nbs; n <= nbe; n++) { + int on = newtoold[n]; + if ((ranklist[on] == Globals::my_rank) && + (loclist[on].level() == newloc[n].level())) { + // on the same MPI rank and same level -> just move it + new_block_list[n - nbs] = FindMeshBlock(on); + if (!new_block_list[n - nbs]) { + BoundaryFlag block_bcs[6]; + SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs); + new_block_list[n - nbs] = + MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, + app_in, packages, resolved_packages, gflag); + } + } else { + // on a different refinement level or MPI rank - create a new block BoundaryFlag block_bcs[6]; SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs); + // append new block to list of MeshBlocks new_block_list[n - nbs] = MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, app_in, packages, resolved_packages, gflag); } - } else { - // on a different refinement level or MPI rank - create a new block - BoundaryFlag block_bcs[6]; - SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs); - // append new block to list of MeshBlocks - new_block_list[n - nbs] = - MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, app_in, - packages, resolved_packages, gflag); } - } + } // AMR Construct new MeshBlockList region // Replace the MeshBlock list auto old_block_list = std::move(block_list); @@ -832,142 +829,141 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput block_list[n - nbs]->lid = n - nbs; } - Kokkos::Profiling::popRegion(); // AMR: Construct new MeshBlockList - // Receive the data and load into MeshBlocks - Kokkos::Profiling::pushRegion("AMR: Recv data and unpack"); - bool all_received; - int niter = 0; - if (block_list.size() > 0) { - // Create a vector for holding the status of all communications, it is sized to fit - // the maximal number of calculations that this rank could receive: the number of - // blocks on the rank x the number of variables x times the number of fine blocks - // that would communicate if every block had been coarsened (8 in 3D) - std::vector finished( - std::max((nbe - nbs + 1), 1) * FindMeshBlock(nbs)->vars_cc_.size() * 8, false); - do { - all_received = true; - niter++; - int idx = 0; - for (int n = nbs; n <= nbe; n++) { - int on = newtoold[n]; - LogicalLocation &oloc = loclist[on]; - LogicalLocation &nloc = newloc[n]; - auto pb = FindMeshBlock(n); - if (oloc.level() == nloc.level() && - ranklist[on] != Globals::my_rank) { // same level, different rank + { // AMR Recv and unpack data + PARTHENON_INSTRUMENT_REGION("AMR_Recv_unpack") + bool all_received; + int niter = 0; + if (block_list.size() > 0) { + // Create a vector for holding the status of all communications, it is sized to fit + // the maximal number of calculations that this rank could receive: the number of + // blocks on the rank x the number of variables x times the number of fine blocks + // that would communicate if every block had been coarsened (8 in 3D) + std::vector finished( + std::max((nbe - nbs + 1), 1) * FindMeshBlock(nbs)->vars_cc_.size() * 8, false); + do { + all_received = true; + niter++; + int idx = 0; + for (int n = nbs; n <= nbe; n++) { + int on = newtoold[n]; + LogicalLocation &oloc = loclist[on]; + LogicalLocation &nloc = newloc[n]; + auto pb = FindMeshBlock(n); + if (oloc.level() == nloc.level() && + ranklist[on] != Globals::my_rank) { // same level, different rank #ifdef MPI_PARALLEL - for (auto &var : pb->vars_cc_) { - if (!finished[idx]) - finished[idx] = - TryRecvSameToSame(n - nbs, ranklist[on], var.get(), pb.get(), this); - all_received = finished[idx++] && all_received; - } + for (auto &var : pb->vars_cc_) { + if (!finished[idx]) + finished[idx] = + TryRecvSameToSame(n - nbs, ranklist[on], var.get(), pb.get(), this); + all_received = finished[idx++] && all_received; + } #endif - } else if (oloc.level() > nloc.level()) { // f2c - for (int l = 0; l < nleaf; l++) { - auto pob = pb; - if (ranklist[on + l] == Globals::my_rank) pob = old_block_list[on + l - onbs]; - LogicalLocation &oloc = loclist[on + l]; + } else if (oloc.level() > nloc.level()) { // f2c + for (int l = 0; l < nleaf; l++) { + auto pob = pb; + if (ranklist[on + l] == Globals::my_rank) + pob = old_block_list[on + l - onbs]; + LogicalLocation &oloc = loclist[on + l]; + for (auto &var : pb->vars_cc_) { + if (!finished[idx]) { + auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label()); + finished[idx] = + TryRecvFineToCoarse(n - nbs, ranklist[on + l], oloc, var_in.get(), + var.get(), pb.get(), this); + } + all_received = finished[idx++] && all_received; + } + } + } else if (oloc.level() < nloc.level()) { // c2f for (auto &var : pb->vars_cc_) { if (!finished[idx]) { + auto pob = pb; + if (ranklist[on] == Globals::my_rank) pob = old_block_list[on - onbs]; auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label()); - finished[idx] = - TryRecvFineToCoarse(n - nbs, ranklist[on + l], oloc, var_in.get(), - var.get(), pb.get(), this); + finished[idx] = TryRecvCoarseToFine( + n - nbs, ranklist[on], nloc, var_in.get(), var.get(), pb.get(), this); } all_received = finished[idx++] && all_received; } } - } else if (oloc.level() < nloc.level()) { // c2f - for (auto &var : pb->vars_cc_) { - if (!finished[idx]) { - auto pob = pb; - if (ranklist[on] == Globals::my_rank) pob = old_block_list[on - onbs]; - auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label()); - finished[idx] = TryRecvCoarseToFine( - n - nbs, ranklist[on], nloc, var_in.get(), var.get(), pb.get(), this); - } - all_received = finished[idx++] && all_received; - } } - } - // rb_idx is a running index, so we repeat the loop until all vals are true - } while (!all_received && niter < 1e7); - if (!all_received) PARTHENON_FAIL("AMR Receive failed"); - } - // Fence here to be careful that all communication is finished before moving - // on to prolongation - Kokkos::fence(); - - // Prolongate blocks that had a coarse buffer filled (i.e. c2f blocks) - ProResCache_t prolongation_cache; - int nprolong = 0; - for (int nn = nbs; nn <= nbe; nn++) { - int on = newtoold[nn]; - auto pmb = FindMeshBlock(nn); - if (newloc[nn].level() > loclist[on].level()) nprolong += pmb->vars_cc_.size(); - } - prolongation_cache.Initialize(nprolong, resolved_packages.get()); - int iprolong = 0; - for (int nn = nbs; nn <= nbe; nn++) { - int on = newtoold[nn]; - if (newloc[nn].level() > loclist[on].level()) { + // rb_idx is a running index, so we repeat the loop until all vals are true + } while (!all_received && niter < 1e7); + if (!all_received) PARTHENON_FAIL("AMR Receive failed"); + } + // Fence here to be careful that all communication is finished before moving + // on to prolongation + Kokkos::fence(); + + // Prolongate blocks that had a coarse buffer filled (i.e. c2f blocks) + ProResCache_t prolongation_cache; + int nprolong = 0; + for (int nn = nbs; nn <= nbe; nn++) { + int on = newtoold[nn]; auto pmb = FindMeshBlock(nn); - for (auto &var : pmb->vars_cc_) { - prolongation_cache.RegisterRegionHost(iprolong++, - ProResInfo::GetInteriorProlongate(pmb, var), - var.get(), resolved_packages.get()); + if (newloc[nn].level() > loclist[on].level()) nprolong += pmb->vars_cc_.size(); + } + prolongation_cache.Initialize(nprolong, resolved_packages.get()); + int iprolong = 0; + for (int nn = nbs; nn <= nbe; nn++) { + int on = newtoold[nn]; + if (newloc[nn].level() > loclist[on].level()) { + auto pmb = FindMeshBlock(nn); + for (auto &var : pmb->vars_cc_) { + prolongation_cache.RegisterRegionHost( + iprolong++, ProResInfo::GetInteriorProlongate(pmb, var), var.get(), + resolved_packages.get()); + } } } - } - prolongation_cache.CopyToDevice(); - - refinement::ProlongateShared(resolved_packages.get(), prolongation_cache, - block_list[0]->cellbounds, block_list[0]->c_cellbounds); - - // update the lists - loclist = std::move(newloc); - ranklist = std::move(newrank); - costlist = std::move(newcost); - - // A block newly refined and prolongated may have neighbors which were - // already refined to the new level. - // If so, the prolongated versions of shared elements will not reflect - // the true, finer versions present in the neighbor block. - // We must create any new fine buffers and fill them from these neighbors - // in order to maintain a consistent global state. - // Thus we rebuild and synchronize the mesh now, but using a unique - // neighbor precedence favoring the "old" fine blocks over "new" ones - for (auto &pmb : block_list) { - pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data(), - newly_refined); - } - // Make sure all old sends/receives are done before we reconfigure the mesh -#ifdef MPI_PARALLEL - if (send_reqs.size() != 0) - PARTHENON_MPI_CHECK( - MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE)); -#endif - // Re-initialize the mesh with our temporary ownership/neighbor configurations. - // No buffers are different when we switch to the final precedence order. - Initialize(false, pin, app_in); + prolongation_cache.CopyToDevice(); - // Internal refinement relies on the fine shared values, which are only consistent after - // being updated with any previously fine versions - refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache, + refinement::ProlongateShared(resolved_packages.get(), prolongation_cache, block_list[0]->cellbounds, block_list[0]->c_cellbounds); - // Rebuild just the ownership model, this time weighting the "new" fine blocks just like - // any other blocks at their level. - for (auto &pmb : block_list) { - pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data()); - } + // update the lists + loclist = std::move(newloc); + ranklist = std::move(newrank); + costlist = std::move(newcost); + + // A block newly refined and prolongated may have neighbors which were + // already refined to the new level. + // If so, the prolongated versions of shared elements will not reflect + // the true, finer versions present in the neighbor block. + // We must create any new fine buffers and fill them from these neighbors + // in order to maintain a consistent global state. + // Thus we rebuild and synchronize the mesh now, but using a unique + // neighbor precedence favoring the "old" fine blocks over "new" ones + for (auto &pmb : block_list) { + pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data(), + newly_refined); + } + // Make sure all old sends/receives are done before we reconfigure the mesh +#ifdef MPI_PARALLEL + if (send_reqs.size() != 0) + PARTHENON_MPI_CHECK( + MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE)); +#endif + // Re-initialize the mesh with our temporary ownership/neighbor configurations. + // No buffers are different when we switch to the final precedence order. + Initialize(false, pin, app_in); + + // Internal refinement relies on the fine shared values, which are only consistent + // after being updated with any previously fine versions + refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache, + block_list[0]->cellbounds, + block_list[0]->c_cellbounds); + + // Rebuild just the ownership model, this time weighting the "new" fine blocks just + // like any other blocks at their level. + for (auto &pmb : block_list) { + pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data()); + } - Kokkos::Profiling::popRegion(); // AMR: Recv data and unpack + } // AMR Recv and unpack data ResetLoadBalanceVariables(); - - Kokkos::Profiling::popRegion(); // RedistributeAndRefineMeshBlocks } } // namespace parthenon diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp index ed81e3e4e219..a6651154ae3a 100644 --- a/src/mesh/mesh.cpp +++ b/src/mesh/mesh.cpp @@ -987,7 +987,7 @@ void Mesh::ApplyUserWorkBeforeOutput(ParameterInput *pin) { // \brief initialization before the main loop as well as during remeshing void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *app_in) { - Kokkos::Profiling::pushRegion("Mesh::Initialize"); + PARTHENON_INSTRUMENT bool init_done = true; const int nb_initial = nbtotal; do { @@ -1174,8 +1174,6 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput * // Initialize the "base" MeshData object mesh_data.Get()->Set(block_list); - - Kokkos::Profiling::popRegion(); // Mesh::Initialize } /// Finds location of a block with ID `tgid`. diff --git a/src/outputs/outputs.cpp b/src/outputs/outputs.cpp index de54d7827c59..14eac0dd20fa 100644 --- a/src/outputs/outputs.cpp +++ b/src/outputs/outputs.cpp @@ -402,7 +402,7 @@ void OutputType::ClearOutputData() { void Outputs::MakeOutputs(Mesh *pm, ParameterInput *pin, SimTime *tm, const SignalHandler::OutputSignal signal) { - Kokkos::Profiling::pushRegion("MakeOutputs"); + PARTHENON_INSTRUMENT bool first = true; OutputType *ptype = pfirst_type_; while (ptype != nullptr) { @@ -418,7 +418,6 @@ void Outputs::MakeOutputs(Mesh *pm, ParameterInput *pin, SimTime *tm, } ptype = ptype->pnext_type; // move to next OutputType node in singly linked list } - Kokkos::Profiling::popRegion(); // MakeOutputs } } // namespace parthenon diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp index 95b89b93b627..866d4096015d 100644 --- a/src/outputs/parthenon_hdf5.cpp +++ b/src/outputs/parthenon_hdf5.cpp @@ -63,12 +63,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm const SignalHandler::OutputSignal signal) { using namespace HDF5; using namespace OutputUtils; - - if constexpr (WRITE_SINGLE_PRECISION) { - Kokkos::Profiling::pushRegion("PHDF5::WriteOutputFileSinglePrec"); - } else { - Kokkos::Profiling::pushRegion("PHDF5::WriteOutputFileRealPrec"); - } + PARTHENON_INSTRUMENT // writes all graphics variables to hdf file // HDF5 structures @@ -117,95 +112,94 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // -------------------------------------------------------------------------------- // // WRITING ATTRIBUTES // // -------------------------------------------------------------------------------- // - Kokkos::Profiling::pushRegion("write Attributes"); - { - Kokkos::Profiling::pushRegion("write input"); - // write input key-value pairs - std::ostringstream oss; - pin->ParameterDump(oss); - - // Mesh information - const H5G input_group = MakeGroup(file, "/Input"); - - HDF5WriteAttribute("File", oss.str().c_str(), input_group); - Kokkos::Profiling::popRegion(); // write input - } // Input section - - // we'll need this again at the end - const H5G info_group = MakeGroup(file, "/Info"); - { - Kokkos::Profiling::pushRegion("write Info"); - HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group); + H5G info_group; + { // write attributes + PARTHENON_INSTRUMENT_REGION("write_attributes") { + PARTHENON_INSTRUMENT_REGION("write_input") + // write input key-value pairs + std::ostringstream oss; + pin->ParameterDump(oss); + + // Mesh information + const H5G input_group = MakeGroup(file, "/Input"); + + HDF5WriteAttribute("File", oss.str().c_str(), input_group); + } // Input section + + // we'll need this again at the end + info_group = MakeGroup(file, "/Info"); + { + PARTHENON_INSTRUMENT_REGION("write_info") + HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group); - if (tm != nullptr) { - HDF5WriteAttribute("NCycle", tm->ncycle, info_group); - HDF5WriteAttribute("Time", tm->time, info_group); - HDF5WriteAttribute("dt", tm->dt, info_group); - } + if (tm != nullptr) { + HDF5WriteAttribute("NCycle", tm->ncycle, info_group); + HDF5WriteAttribute("Time", tm->time, info_group); + HDF5WriteAttribute("dt", tm->dt, info_group); + } - HDF5WriteAttribute("WallTime", Driver::elapsed_main(), info_group); - HDF5WriteAttribute("NumDims", pm->ndim, info_group); - HDF5WriteAttribute("NumMeshBlocks", pm->nbtotal, info_group); - HDF5WriteAttribute("MaxLevel", max_level, info_group); - // write whether we include ghost cells or not - HDF5WriteAttribute("IncludesGhost", output_params.include_ghost_zones ? 1 : 0, - info_group); - // write number of ghost cells in simulation - HDF5WriteAttribute("NGhost", Globals::nghost, info_group); - HDF5WriteAttribute("Coordinates", std::string(first_block.coords.Name()).c_str(), - info_group); - - // restart info, write always - HDF5WriteAttribute("NBNew", pm->nbnew, info_group); - HDF5WriteAttribute("NBDel", pm->nbdel, info_group); - HDF5WriteAttribute("RootLevel", rootLevel, info_group); - HDF5WriteAttribute("Refine", pm->adaptive ? 1 : 0, info_group); - HDF5WriteAttribute("Multilevel", pm->multilevel ? 1 : 0, info_group); - - HDF5WriteAttribute("BlocksPerPE", nblist, info_group); - - // Mesh block size - HDF5WriteAttribute("MeshBlockSize", std::vector{nx1, nx2, nx3}, info_group); - - // RootGridDomain - float[9] array with xyz mins, maxs, rats (dx(i)/dx(i-1)) - HDF5WriteAttribute( - "RootGridDomain", - std::vector{pm->mesh_size.xmin(X1DIR), pm->mesh_size.xmax(X1DIR), - pm->mesh_size.xrat(X1DIR), pm->mesh_size.xmin(X2DIR), - pm->mesh_size.xmax(X2DIR), pm->mesh_size.xrat(X2DIR), - pm->mesh_size.xmin(X3DIR), pm->mesh_size.xmax(X3DIR), - pm->mesh_size.xrat(X3DIR)}, - info_group); - - // Root grid size (number of cells at root level) - HDF5WriteAttribute("RootGridSize", - std::vector{pm->mesh_size.nx(X1DIR), pm->mesh_size.nx(X2DIR), - pm->mesh_size.nx(X3DIR)}, - info_group); - - // Boundary conditions - std::vector boundary_condition_str(BOUNDARY_NFACES); - for (size_t i = 0; i < boundary_condition_str.size(); i++) { - boundary_condition_str[i] = GetBoundaryString(pm->mesh_bcs[i]); - } + HDF5WriteAttribute("WallTime", Driver::elapsed_main(), info_group); + HDF5WriteAttribute("NumDims", pm->ndim, info_group); + HDF5WriteAttribute("NumMeshBlocks", pm->nbtotal, info_group); + HDF5WriteAttribute("MaxLevel", max_level, info_group); + // write whether we include ghost cells or not + HDF5WriteAttribute("IncludesGhost", output_params.include_ghost_zones ? 1 : 0, + info_group); + // write number of ghost cells in simulation + HDF5WriteAttribute("NGhost", Globals::nghost, info_group); + HDF5WriteAttribute("Coordinates", std::string(first_block.coords.Name()).c_str(), + info_group); + + // restart info, write always + HDF5WriteAttribute("NBNew", pm->nbnew, info_group); + HDF5WriteAttribute("NBDel", pm->nbdel, info_group); + HDF5WriteAttribute("RootLevel", rootLevel, info_group); + HDF5WriteAttribute("Refine", pm->adaptive ? 1 : 0, info_group); + HDF5WriteAttribute("Multilevel", pm->multilevel ? 1 : 0, info_group); + + HDF5WriteAttribute("BlocksPerPE", nblist, info_group); + + // Mesh block size + HDF5WriteAttribute("MeshBlockSize", std::vector{nx1, nx2, nx3}, info_group); + + // RootGridDomain - float[9] array with xyz mins, maxs, rats (dx(i)/dx(i-1)) + HDF5WriteAttribute( + "RootGridDomain", + std::vector{pm->mesh_size.xmin(X1DIR), pm->mesh_size.xmax(X1DIR), + pm->mesh_size.xrat(X1DIR), pm->mesh_size.xmin(X2DIR), + pm->mesh_size.xmax(X2DIR), pm->mesh_size.xrat(X2DIR), + pm->mesh_size.xmin(X3DIR), pm->mesh_size.xmax(X3DIR), + pm->mesh_size.xrat(X3DIR)}, + info_group); + + // Root grid size (number of cells at root level) + HDF5WriteAttribute("RootGridSize", + std::vector{pm->mesh_size.nx(X1DIR), + pm->mesh_size.nx(X2DIR), + pm->mesh_size.nx(X3DIR)}, + info_group); + + // Boundary conditions + std::vector boundary_condition_str(BOUNDARY_NFACES); + for (size_t i = 0; i < boundary_condition_str.size(); i++) { + boundary_condition_str[i] = GetBoundaryString(pm->mesh_bcs[i]); + } - HDF5WriteAttribute("BoundaryConditions", boundary_condition_str, info_group); - Kokkos::Profiling::popRegion(); // write Info - } // Info section + HDF5WriteAttribute("BoundaryConditions", boundary_condition_str, info_group); + } // Info section - // write Params - { - Kokkos::Profiling::pushRegion("behold: write Params"); - const H5G params_group = MakeGroup(file, "/Params"); + // write Params + { + PARTHENON_INSTRUMENT_REGION("write_params") + const H5G params_group = MakeGroup(file, "/Params"); - for (const auto &package : pm->packages.AllPackages()) { - const auto state = package.second; - // Write all params that can be written as HDF5 attributes - state->AllParams().WriteAllToHDF5(state->label(), params_group); - } - Kokkos::Profiling::popRegion(); // behold: write Params - } // Params section - Kokkos::Profiling::popRegion(); // write Attributes + for (const auto &package : pm->packages.AllPackages()) { + const auto state = package.second; + // Write all params that can be written as HDF5 attributes + state->AllParams().WriteAllToHDF5(state->label(), params_group); + } + } // Params section + } // write attributes // -------------------------------------------------------------------------------- // // WRITING MESHBLOCK METADATA // @@ -242,7 +236,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // write Blocks metadata { - Kokkos::Profiling::pushRegion("write block metadata"); + PARTHENON_INSTRUMENT_REGION("write_block_metadata") const H5G gBlocks = MakeGroup(file, "/Blocks"); // write Xmin[ndim] for blocks @@ -272,39 +266,39 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm HDF5Write2D(gBlocks, "loc.level-gid-lid-cnghost-gflag", tmpID.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, pl_xfer); } - Kokkos::Profiling::popRegion(); // write block metadata - } // Block section + } // Block section // Write mesh coordinates to file - Kokkos::Profiling::pushRegion("write mesh coords"); - for (const bool face : {true, false}) { - const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations"); + { // write mesh coords + PARTHENON_INSTRUMENT_REGION("write_mesh_coords"); + for (const bool face : {true, false}) { + const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations"); - // write X coordinates - std::vector loc_x((nx1 + face) * num_blocks_local); - std::vector loc_y((nx2 + face) * num_blocks_local); - std::vector loc_z((nx3 + face) * num_blocks_local); + // write X coordinates + std::vector loc_x((nx1 + face) * num_blocks_local); + std::vector loc_y((nx2 + face) * num_blocks_local); + std::vector loc_z((nx3 + face) * num_blocks_local); - ComputeCoords_(pm, face, out_ib, out_jb, out_kb, loc_x, loc_y, loc_z); + ComputeCoords_(pm, face, out_ib, out_jb, out_kb, loc_x, loc_y, loc_z); - local_count[1] = global_count[1] = nx1 + face; - HDF5Write2D(gLocations, "x", loc_x.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer); + local_count[1] = global_count[1] = nx1 + face; + HDF5Write2D(gLocations, "x", loc_x.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, + pl_xfer); - local_count[1] = global_count[1] = nx2 + face; - HDF5Write2D(gLocations, "y", loc_y.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer); + local_count[1] = global_count[1] = nx2 + face; + HDF5Write2D(gLocations, "y", loc_y.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, + pl_xfer); - local_count[1] = global_count[1] = nx3 + face; - HDF5Write2D(gLocations, "z", loc_z.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer); - } - Kokkos::Profiling::popRegion(); // write mesh coords + local_count[1] = global_count[1] = nx3 + face; + HDF5Write2D(gLocations, "z", loc_z.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, + pl_xfer); + } + } // write mesh coords // Write Levels and Logical Locations with the level for each Meshblock loclist contains // levels and logical locations for all meshblocks on all ranks { - Kokkos::Profiling::pushRegion("write levels and locations"); + PARTHENON_INSTRUMENT_REGION("write_levels_and_locations") const auto &loclist = pm->GetLocList(); std::vector levels; @@ -331,236 +325,241 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // reset for collective output local_count[0] = num_blocks_local; - Kokkos::Profiling::popRegion(); // write levels and locations } // -------------------------------------------------------------------------------- // // WRITING VARIABLES DATA // // -------------------------------------------------------------------------------- // - Kokkos::Profiling::pushRegion("write all variable data"); - - // All blocks have the same list of variable metadata that exist in the entire - // simulation, but not all variables may be allocated on all blocks - - auto get_vars = [=](const std::shared_ptr pmb) { - auto &var_vec = pmb->meshblock_data.Get()->GetVariableVector(); - if (restart_) { - // get all vars with flag Independent OR restart - return GetAnyVariables( - var_vec, {parthenon::Metadata::Independent, parthenon::Metadata::Restart}); - } else { - return GetAnyVariables(var_vec, output_params.variables); - } - }; - - // get list of all vars, just use the first block since the list is the same for all - // blocks std::vector all_vars_info; - const auto vars = get_vars(pm->block_list.front()); - for (auto &v : vars) { - all_vars_info.emplace_back(v); - } - - // sort alphabetically - std::sort(all_vars_info.begin(), all_vars_info.end(), - [](const VarInfo &a, const VarInfo &b) { return a.label < b.label; }); - - // We need to add information about the sparse variables to the HDF5 file, namely: - // 1) Which variables are sparse - // 2) Is a sparse id of a particular sparse variable allocated on a given block - // - // This information is stored in the dataset called "SparseInfo". The data set - // contains an attribute "SparseFields" that is a vector of strings with the names - // of the sparse fields (field name with sparse id, i.e. "bar_28", "bar_7", foo_1", - // "foo_145"). The field names are in alphabetical order, which is the same order - // they show up in all_unique_vars (because it's a sorted set). - // - // The dataset SparseInfo itself is a 2D array of bools. The first index is the - // global block index and the second index is the sparse field (same order as the - // SparseFields attribute). SparseInfo[b][v] is true if the sparse field with index - // v is allocated on the block with index b, otherwise the value is false - std::vector sparse_names; - std::unordered_map sparse_field_idx; - for (auto &vinfo : all_vars_info) { - if (vinfo.is_sparse) { - sparse_field_idx.insert({vinfo.label, sparse_names.size()}); - sparse_names.push_back(vinfo.label); - } - } - - hsize_t num_sparse = sparse_names.size(); - // can't use std::vector here because std::vector is the same as - // std::vector and it doesn't have .data() member - std::unique_ptr sparse_allocated(new hbool_t[num_blocks_local * num_sparse]); - - // allocate space for largest size variable - int varSize_max = 0; - for (auto &vinfo : all_vars_info) { - const int varSize = - vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1; - varSize_max = std::max(varSize_max, varSize); - } - - using OutT = typename std::conditional::type; - std::vector tmpData(varSize_max * num_blocks_local); - - // create persistent spaces - local_count[0] = num_blocks_local; - global_count[0] = max_blocks_global; - local_count[4] = global_count[4] = nx3; - local_count[5] = global_count[5] = nx2; - local_count[6] = global_count[6] = nx1; + hsize_t num_sparse; + std::unique_ptr sparse_allocated; + { // write all variable data + PARTHENON_INSTRUMENT_REGION("write_all_variable_data") + + // All blocks have the same list of variable metadata that exist in the entire + // simulation, but not all variables may be allocated on all blocks + + auto get_vars = [=](const std::shared_ptr pmb) { + auto &var_vec = pmb->meshblock_data.Get()->GetVariableVector(); + if (restart_) { + // get all vars with flag Independent OR restart + return GetAnyVariables( + var_vec, {parthenon::Metadata::Independent, parthenon::Metadata::Restart}); + } else { + return GetAnyVariables(var_vec, output_params.variables); + } + }; - // for each variable we write - for (auto &vinfo : all_vars_info) { - Kokkos::Profiling::pushRegion("write variable loop"); - // not really necessary, but doesn't hurt - memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT)); + // get list of all vars, just use the first block since the list is the same for all + // blocks + const auto vars = get_vars(pm->block_list.front()); + for (auto &v : vars) { + all_vars_info.emplace_back(v); + } - const std::string var_name = vinfo.label; - const hsize_t nx6 = vinfo.nx6; - const hsize_t nx5 = vinfo.nx5; - const hsize_t nx4 = vinfo.nx4; + // sort alphabetically + std::sort(all_vars_info.begin(), all_vars_info.end(), + [](const VarInfo &a, const VarInfo &b) { return a.label < b.label; }); + + // We need to add information about the sparse variables to the HDF5 file, namely: + // 1) Which variables are sparse + // 2) Is a sparse id of a particular sparse variable allocated on a given block + // + // This information is stored in the dataset called "SparseInfo". The data set + // contains an attribute "SparseFields" that is a vector of strings with the names + // of the sparse fields (field name with sparse id, i.e. "bar_28", "bar_7", foo_1", + // "foo_145"). The field names are in alphabetical order, which is the same order + // they show up in all_unique_vars (because it's a sorted set). + // + // The dataset SparseInfo itself is a 2D array of bools. The first index is the + // global block index and the second index is the sparse field (same order as the + // SparseFields attribute). SparseInfo[b][v] is true if the sparse field with index + // v is allocated on the block with index b, otherwise the value is false + + std::unordered_map sparse_field_idx; + for (auto &vinfo : all_vars_info) { + if (vinfo.is_sparse) { + sparse_field_idx.insert({vinfo.label, sparse_names.size()}); + sparse_names.push_back(vinfo.label); + } + } - local_count[1] = global_count[1] = nx6; - local_count[2] = global_count[2] = nx5; - local_count[3] = global_count[3] = nx4; + num_sparse = sparse_names.size(); + // can't use std::vector here because std::vector is the same as + // std::vector and it doesn't have .data() member + sparse_allocated = std::make_unique(num_blocks_local * num_sparse); + + // allocate space for largest size variable + int varSize_max = 0; + for (auto &vinfo : all_vars_info) { + const int varSize = + vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1; + varSize_max = std::max(varSize_max, varSize); + } - std::vector alldims({nx6, nx5, nx4, static_cast(vinfo.nx3), - static_cast(vinfo.nx2), - static_cast(vinfo.nx1)}); + using OutT = typename std::conditional::type; + std::vector tmpData(varSize_max * num_blocks_local); - int ndim = -1; + // create persistent spaces + local_count[0] = num_blocks_local; + global_count[0] = max_blocks_global; + local_count[4] = global_count[4] = nx3; + local_count[5] = global_count[5] = nx2; + local_count[6] = global_count[6] = nx1; + + // for each variable we write + for (auto &vinfo : all_vars_info) { + PARTHENON_INSTRUMENT_REGION("write_variable_loop") + // not really necessary, but doesn't hurt + memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT)); + + const std::string var_name = vinfo.label; + const hsize_t nx6 = vinfo.nx6; + const hsize_t nx5 = vinfo.nx5; + const hsize_t nx4 = vinfo.nx4; + + local_count[1] = global_count[1] = nx6; + local_count[2] = global_count[2] = nx5; + local_count[3] = global_count[3] = nx4; + + std::vector alldims({nx6, nx5, nx4, static_cast(vinfo.nx3), + static_cast(vinfo.nx2), + static_cast(vinfo.nx1)}); + + int ndim = -1; #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION - // we need chunks to enable compression - std::array chunk_size({1, 1, 1, 1, 1, 1, 1}); + // we need chunks to enable compression + std::array chunk_size({1, 1, 1, 1, 1, 1, 1}); #endif - if (vinfo.where == MetadataFlag(Metadata::Cell)) { - ndim = 3 + vinfo.tensor_rank + 1; - for (int i = 0; i < vinfo.tensor_rank; i++) { - local_count[1 + i] = global_count[1 + i] = alldims[3 - vinfo.tensor_rank + i]; - } - local_count[vinfo.tensor_rank + 1] = global_count[vinfo.tensor_rank + 1] = nx3; - local_count[vinfo.tensor_rank + 2] = global_count[vinfo.tensor_rank + 2] = nx2; - local_count[vinfo.tensor_rank + 3] = global_count[vinfo.tensor_rank + 3] = nx1; + if (vinfo.where == MetadataFlag(Metadata::Cell)) { + ndim = 3 + vinfo.tensor_rank + 1; + for (int i = 0; i < vinfo.tensor_rank; i++) { + local_count[1 + i] = global_count[1 + i] = alldims[3 - vinfo.tensor_rank + i]; + } + local_count[vinfo.tensor_rank + 1] = global_count[vinfo.tensor_rank + 1] = nx3; + local_count[vinfo.tensor_rank + 2] = global_count[vinfo.tensor_rank + 2] = nx2; + local_count[vinfo.tensor_rank + 3] = global_count[vinfo.tensor_rank + 3] = nx1; #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION - if (output_params.hdf5_compression_level > 0) { - for (int i = ndim - 3; i < ndim; i++) { - chunk_size[i] = local_count[i]; + if (output_params.hdf5_compression_level > 0) { + for (int i = ndim - 3; i < ndim; i++) { + chunk_size[i] = local_count[i]; + } } - } #endif - } else if (vinfo.where == MetadataFlag(Metadata::None)) { - ndim = vinfo.tensor_rank + 1; - for (int i = 0; i < vinfo.tensor_rank; i++) { - local_count[1 + i] = global_count[1 + i] = alldims[6 - vinfo.tensor_rank + i]; - } + } else if (vinfo.where == MetadataFlag(Metadata::None)) { + ndim = vinfo.tensor_rank + 1; + for (int i = 0; i < vinfo.tensor_rank; i++) { + local_count[1 + i] = global_count[1 + i] = alldims[6 - vinfo.tensor_rank + i]; + } #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION - if (output_params.hdf5_compression_level > 0) { - int nchunk_indices = std::min(vinfo.tensor_rank, 3); - for (int i = ndim - nchunk_indices; i < ndim; i++) { - chunk_size[i] = alldims[6 - nchunk_indices + i]; + if (output_params.hdf5_compression_level > 0) { + int nchunk_indices = std::min(vinfo.tensor_rank, 3); + for (int i = ndim - nchunk_indices; i < ndim; i++) { + chunk_size[i] = alldims[6 - nchunk_indices + i]; + } } - } #endif - } else { - PARTHENON_THROW("Only Cell and None locations supported!"); - } + } else { + PARTHENON_THROW("Only Cell and None locations supported!"); + } #ifndef PARTHENON_DISABLE_HDF5_COMPRESSION - PARTHENON_HDF5_CHECK(H5Pset_chunk(pl_dcreate, ndim, chunk_size.data())); - // Do not run the pipeline if compression is soft disabled. - // By default data would still be passed, which may result in slower output. - if (output_params.hdf5_compression_level > 0) { - PARTHENON_HDF5_CHECK( - H5Pset_deflate(pl_dcreate, std::min(9, output_params.hdf5_compression_level))); - } + PARTHENON_HDF5_CHECK(H5Pset_chunk(pl_dcreate, ndim, chunk_size.data())); + // Do not run the pipeline if compression is soft disabled. + // By default data would still be passed, which may result in slower output. + if (output_params.hdf5_compression_level > 0) { + PARTHENON_HDF5_CHECK(H5Pset_deflate( + pl_dcreate, std::min(9, output_params.hdf5_compression_level))); + } #endif - // load up data - hsize_t index = 0; - - Kokkos::Profiling::pushRegion("fill host output buffer"); - // for each local mesh block - for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) { - const auto &pmb = pm->block_list[b_idx]; - bool is_allocated = false; - - // for each variable that this local meshblock actually has - const auto vars = get_vars(pmb); - for (auto &v : vars) { - // For reference, if we update the logic here, there's also - // a similar block in parthenon_manager.cpp - if (v->IsAllocated() && (var_name == v->label())) { - auto v_h = v->data.GetHostMirrorAndCopy(); - for (int t = 0; t < nx6; ++t) { - for (int u = 0; u < nx5; ++u) { - for (int v = 0; v < nx4; ++v) { - if (vinfo.where == MetadataFlag(Metadata::Cell)) { - for (int k = out_kb.s; k <= out_kb.e; ++k) { - for (int j = out_jb.s; j <= out_jb.e; ++j) { - for (int i = out_ib.s; i <= out_ib.e; ++i) { - tmpData[index++] = static_cast(v_h(t, u, v, k, j, i)); + // load up data + hsize_t index = 0; + + { // fill host output buffer + PARTHENON_INSTRUMENT_REGION("fill_host_output_buffer") + // for each local mesh block + for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) { + const auto &pmb = pm->block_list[b_idx]; + bool is_allocated = false; + + // for each variable that this local meshblock actually has + const auto vars = get_vars(pmb); + for (auto &v : vars) { + // For reference, if we update the logic here, there's also + // a similar block in parthenon_manager.cpp + if (v->IsAllocated() && (var_name == v->label())) { + auto v_h = v->data.GetHostMirrorAndCopy(); + for (int t = 0; t < nx6; ++t) { + for (int u = 0; u < nx5; ++u) { + for (int v = 0; v < nx4; ++v) { + if (vinfo.where == MetadataFlag(Metadata::Cell)) { + for (int k = out_kb.s; k <= out_kb.e; ++k) { + for (int j = out_jb.s; j <= out_jb.e; ++j) { + for (int i = out_ib.s; i <= out_ib.e; ++i) { + tmpData[index++] = static_cast(v_h(t, u, v, k, j, i)); + } + } } - } - } - } else { - for (int k = 0; k < vinfo.nx3; ++k) { - for (int j = 0; j < vinfo.nx2; ++j) { - for (int i = 0; i < vinfo.nx1; ++i) { - tmpData[index++] = static_cast(v_h(t, u, v, k, j, i)); + } else { + for (int k = 0; k < vinfo.nx3; ++k) { + for (int j = 0; j < vinfo.nx2; ++j) { + for (int i = 0; i < vinfo.nx1; ++i) { + tmpData[index++] = static_cast(v_h(t, u, v, k, j, i)); + } + } } } } } } + + is_allocated = true; + break; } } - is_allocated = true; - break; - } - } - - if (vinfo.is_sparse) { - size_t sparse_idx = sparse_field_idx.at(vinfo.label); - sparse_allocated[b_idx * num_sparse + sparse_idx] = is_allocated; - } + if (vinfo.is_sparse) { + size_t sparse_idx = sparse_field_idx.at(vinfo.label); + sparse_allocated[b_idx * num_sparse + sparse_idx] = is_allocated; + } - if (!is_allocated) { - if (vinfo.is_sparse) { - hsize_t varSize{}; - if (vinfo.where == MetadataFlag(Metadata::Cell)) { - varSize = vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * (out_kb.e - out_kb.s + 1) * - (out_jb.e - out_jb.s + 1) * (out_ib.e - out_ib.s + 1); - } else { - varSize = - vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1; + if (!is_allocated) { + if (vinfo.is_sparse) { + hsize_t varSize{}; + if (vinfo.where == MetadataFlag(Metadata::Cell)) { + varSize = vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * (out_kb.e - out_kb.s + 1) * + (out_jb.e - out_jb.s + 1) * (out_ib.e - out_ib.s + 1); + } else { + varSize = + vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1; + } + auto fill_val = output_params.sparse_seed_nans + ? std::numeric_limits::quiet_NaN() + : 0; + std::fill(tmpData.data() + index, tmpData.data() + index + varSize, + fill_val); + index += varSize; + } else { + std::stringstream msg; + msg << "### ERROR: Unable to find dense variable " << var_name << std::endl; + PARTHENON_FAIL(msg); + } } - auto fill_val = - output_params.sparse_seed_nans ? std::numeric_limits::quiet_NaN() : 0; - std::fill(tmpData.data() + index, tmpData.data() + index + varSize, fill_val); - index += varSize; - } else { - std::stringstream msg; - msg << "### ERROR: Unable to find dense variable " << var_name << std::endl; - PARTHENON_FAIL(msg); } - } + } // fill host output buffer + + { // write variable data + PARTHENON_INSTRUMENT_REGION("write_variable_data") + // write data to file + HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt, + p_glob_cnt, pl_xfer, pl_dcreate); + } // write variable data } - Kokkos::Profiling::popRegion(); // fill host output buffer - - Kokkos::Profiling::pushRegion("write variable data"); - // write data to file - HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer, pl_dcreate); - Kokkos::Profiling::popRegion(); // write variable data - Kokkos::Profiling::popRegion(); // write variable loop - } - Kokkos::Profiling::popRegion(); // write all variable data + } // write all variable data // names of variables std::vector var_names; @@ -593,7 +592,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write // this if we have sparse fields) if (num_sparse > 0) { - Kokkos::Profiling::pushRegion("write sparse info"); + PARTHENON_INSTRUMENT_REGION("write_sparse_info") local_count[1] = global_count[1] = num_sparse; HDF5Write2D(file, "SparseInfo", sparse_allocated.get(), p_loc_offset, p_loc_cnt, @@ -606,82 +605,81 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm const H5D dset = H5D::FromHIDCheck(H5Dopen2(file, "SparseInfo", H5P_DEFAULT)); HDF5WriteAttribute("SparseFields", names, dset); - Kokkos::Profiling::popRegion(); // write sparse info - } // SparseInfo and SparseFields sections + } // SparseInfo and SparseFields sections // -------------------------------------------------------------------------------- // // WRITING PARTICLE DATA // // -------------------------------------------------------------------------------- // - Kokkos::Profiling::pushRegion("write particle data"); AllSwarmInfo swarm_info(pm->block_list, output_params.swarms, restart_); - for (auto &[swname, swinfo] : swarm_info.all_info) { - const H5G g_swm = MakeGroup(file, swname); - // offsets/counts are NOT the same here vs the grid data - hsize_t local_offset[6] = {static_cast(my_offset), 0, 0, 0, 0, 0}; - hsize_t local_count[6] = {static_cast(num_blocks_local), 0, 0, 0, 0, 0}; - hsize_t global_count[6] = {static_cast(max_blocks_global), 0, 0, 0, 0, 0}; - // These indicate particles/meshblock and location in global index - // space where each meshblock starts - HDF5Write1D(g_swm, "counts", swinfo.counts.data(), local_offset, local_count, - global_count, pl_xfer); - HDF5Write1D(g_swm, "offsets", swinfo.offsets.data(), local_offset, local_count, - global_count, pl_xfer); - - const H5G g_var = MakeGroup(g_swm, "SwarmVars"); - if (swinfo.global_count == 0) { - continue; - } - auto SetCounts = [&](const SwarmInfo &swinfo, const SwarmVarInfo &vinfo) { - const int rank = vinfo.tensor_rank; - for (int i = 0; i < 6; ++i) { - local_offset[i] = 0; // reset everything - local_count[i] = 0; - global_count[i] = 0; + { // write particle data + PARTHENON_INSTRUMENT_REGION("write_particle_data") + for (auto &[swname, swinfo] : swarm_info.all_info) { + const H5G g_swm = MakeGroup(file, swname); + // offsets/counts are NOT the same here vs the grid data + hsize_t local_offset[6] = {static_cast(my_offset), 0, 0, 0, 0, 0}; + hsize_t local_count[6] = {static_cast(num_blocks_local), 0, 0, 0, 0, 0}; + hsize_t global_count[6] = {static_cast(max_blocks_global), 0, 0, 0, 0, 0}; + // These indicate particles/meshblock and location in global index + // space where each meshblock starts + HDF5Write1D(g_swm, "counts", swinfo.counts.data(), local_offset, local_count, + global_count, pl_xfer); + HDF5Write1D(g_swm, "offsets", swinfo.offsets.data(), local_offset, local_count, + global_count, pl_xfer); + + const H5G g_var = MakeGroup(g_swm, "SwarmVars"); + if (swinfo.global_count == 0) { + continue; } - for (int i = 0; i < rank; ++i) { - local_count[i] = global_count[i] = vinfo.GetN(rank + 1 - i); + auto SetCounts = [&](const SwarmInfo &swinfo, const SwarmVarInfo &vinfo) { + const int rank = vinfo.tensor_rank; + for (int i = 0; i < 6; ++i) { + local_offset[i] = 0; // reset everything + local_count[i] = 0; + global_count[i] = 0; + } + for (int i = 0; i < rank; ++i) { + local_count[i] = global_count[i] = vinfo.GetN(rank + 1 - i); + } + local_offset[rank] = swinfo.global_offset; + local_count[rank] = swinfo.count_on_rank; + global_count[rank] = swinfo.global_count; + }; + auto &int_vars = std::get>(swinfo.vars); + for (auto &[vname, swmvarvec] : int_vars) { + const auto &vinfo = swinfo.var_info.at(vname); + auto host_data = swinfo.FillHostBuffer(vname, swmvarvec); + SetCounts(swinfo, vinfo); + HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset, + local_count, global_count, pl_xfer, H5P_DEFAULT); + } + auto &rvars = std::get>(swinfo.vars); + for (auto &[vname, swmvarvec] : rvars) { + const auto &vinfo = swinfo.var_info.at(vname); + auto host_data = swinfo.FillHostBuffer(vname, swmvarvec); + SetCounts(swinfo, vinfo); + HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset, + local_count, global_count, pl_xfer, H5P_DEFAULT); + } + // If swarm does not contain an "id" object, generate a sequential + // one for vis. + if (swinfo.var_info.count("id") == 0) { + std::vector ids(swinfo.global_count); + std::iota(std::begin(ids), std::end(ids), swinfo.global_offset); + local_offset[0] = swinfo.global_offset; + local_count[0] = swinfo.count_on_rank; + global_count[0] = swinfo.global_count; + HDF5Write1D(g_var, "id", ids.data(), local_offset, local_count, global_count, + pl_xfer); } - local_offset[rank] = swinfo.global_offset; - local_count[rank] = swinfo.count_on_rank; - global_count[rank] = swinfo.global_count; - }; - auto &int_vars = std::get>(swinfo.vars); - for (auto &[vname, swmvarvec] : int_vars) { - const auto &vinfo = swinfo.var_info.at(vname); - auto host_data = swinfo.FillHostBuffer(vname, swmvarvec); - SetCounts(swinfo, vinfo); - HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset, - local_count, global_count, pl_xfer, H5P_DEFAULT); - } - auto &rvars = std::get>(swinfo.vars); - for (auto &[vname, swmvarvec] : rvars) { - const auto &vinfo = swinfo.var_info.at(vname); - auto host_data = swinfo.FillHostBuffer(vname, swmvarvec); - SetCounts(swinfo, vinfo); - HDF5WriteND(g_var, vname, host_data.data(), vinfo.tensor_rank + 1, local_offset, - local_count, global_count, pl_xfer, H5P_DEFAULT); - } - // If swarm does not contain an "id" object, generate a sequential - // one for vis. - if (swinfo.var_info.count("id") == 0) { - std::vector ids(swinfo.global_count); - std::iota(std::begin(ids), std::end(ids), swinfo.global_offset); - local_offset[0] = swinfo.global_offset; - local_count[0] = swinfo.count_on_rank; - global_count[0] = swinfo.global_count; - HDF5Write1D(g_var, "id", ids.data(), local_offset, local_count, global_count, - pl_xfer); } - } - Kokkos::Profiling::popRegion(); // write particle data - - Kokkos::Profiling::pushRegion("genXDMF"); - // generate XDMF companion file - XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info); - Kokkos::Profiling::popRegion(); // genXDMF + } // write particle data - Kokkos::Profiling::popRegion(); // WriteOutputFile???Prec + { // generate xdmf + PARTHENON_INSTRUMENT_REGION("genXDMF") + // generate XDMF companion file + XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info); + } // generate xdmf } std::string PHDF5Output::GenerateFilename_(ParameterInput *pin, SimTime *tm, diff --git a/src/parthenon/prelude.hpp b/src/parthenon/prelude.hpp index 8fafb2c94204..6f28cdcb9ee0 100644 --- a/src/parthenon/prelude.hpp +++ b/src/parthenon/prelude.hpp @@ -33,6 +33,7 @@ namespace prelude { using ::parthenon::BoundaryCommSubset; using ::parthenon::IndexDomain; using ::parthenon::IndexRange; +using ::parthenon::KokkosTimer; using ::parthenon::MeshBlock; using ::parthenon::MeshBlockData; using ::parthenon::MeshData; From 140ca4d6b24ec107831fc9fd695baac2b7b4b609 Mon Sep 17 00:00:00 2001 From: jdolence Date: Tue, 31 Oct 2023 11:02:50 -0600 Subject: [PATCH 02/16] actually add the new file --- src/utils/instrument.hpp | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 src/utils/instrument.hpp diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp new file mode 100644 index 000000000000..3fa3fcb4f7c4 --- /dev/null +++ b/src/utils/instrument.hpp @@ -0,0 +1,43 @@ +//======================================================================================== +// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== +#ifndef UTILS_INSTRUMENT_HPP_ +#define UTILS_INSTRUMENT_HPP_ + +#include + +#include + +#define PARTHENON_INSTRUMENT KokkosTimer internal_inst_(__FILE__, __LINE__, __func__); +#define PARTHENON_INSTRUMENT_REGION(name) \ + KokkosTimer internal_inst_reg____LINE__(__FILE__, __LINE__, name); + +namespace parthenon { + +struct KokkosTimer { + KokkosTimer(const std::string &file, const int line, const std::string &name) { + std::string label; + auto npos = file.rfind('/'); + if (npos == std::string::npos) { + label = file; + } else { + label = file.substr(npos + 1); + } + label += "::" + std::to_string(line) + "::" + name; + Kokkos::Profiling::pushRegion(label); + } + ~KokkosTimer() { Kokkos::Profiling::popRegion(); } +}; + +} // namespace parthenon + +#endif From 4856d8b189238ff3993b8de728975d51cedc915b Mon Sep 17 00:00:00 2001 From: jdolence Date: Tue, 31 Oct 2023 11:03:31 -0600 Subject: [PATCH 03/16] and add it to the cmake --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54a226c55383..062702836532 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -231,6 +231,7 @@ add_library(parthenon utils/error_checking.cpp utils/hash.hpp utils/indexer.hpp + utils/instrument.hpp utils/loop_utils.hpp utils/morton_number.hpp utils/mpi_types.hpp From 27dffb46126af032150c43fbffddc27b04b66cf6 Mon Sep 17 00:00:00 2001 From: jdolence Date: Tue, 31 Oct 2023 16:40:28 -0600 Subject: [PATCH 04/16] add some more macros for auto profiling --- benchmarks/burgers/burgers_package.cpp | 6 ++-- benchmarks/burgers/parthenon_app_inputs.cpp | 2 +- example/advection/advection_package.cpp | 31 ++++++++-------- example/advection/parthenon_app_inputs.cpp | 5 ++- example/calculate_pi/calculate_pi.cpp | 2 +- example/kokkos_pi/kokkos_pi.cpp | 2 +- .../particle_leapfrog/particle_leapfrog.cpp | 4 +-- example/particle_tracers/particle_tracers.cpp | 16 ++++----- example/particles/particles.cpp | 17 +++++---- example/poisson/parthenon_app_inputs.cpp | 4 +-- example/poisson/poisson_package.cpp | 26 +++++++------- .../sparse_advection/parthenon_app_inputs.cpp | 4 +-- .../sparse_advection_package.cpp | 8 ++--- .../stochastic_subgrid_package.cpp | 21 +++++------ src/bvals/comms/boundary_communication.cpp | 4 +-- src/bvals/comms/flux_correction.cpp | 4 +-- src/driver/driver.cpp | 2 +- src/interface/swarm.cpp | 15 ++++---- src/interface/update.cpp | 12 +++---- src/interface/update.hpp | 24 ++++++------- src/mesh/amr_loadbalance.cpp | 16 ++++----- src/outputs/ascent.cpp | 2 +- src/outputs/parthenon_hdf5.cpp | 30 ++++++++-------- src/prolong_restrict/pr_loops.hpp | 8 ++--- src/utils/buffer_utils.cpp | 10 +++--- src/utils/instrument.hpp | 35 +++++++++++++------ 26 files changed, 162 insertions(+), 148 deletions(-) diff --git a/benchmarks/burgers/burgers_package.cpp b/benchmarks/burgers/burgers_package.cpp index b66bc69c2430..4c2c46a71d40 100644 --- a/benchmarks/burgers/burgers_package.cpp +++ b/benchmarks/burgers/burgers_package.cpp @@ -110,7 +110,7 @@ void CalculateDerived(MeshData *md) { size_t scratch_size = 0; constexpr int scratch_level = 0; parthenon::par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "CalculateDerived", DevExecSpace(), scratch_size, + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size, scratch_level, 0, nblocks - 1, kb.s, kb.e, jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) { Real *out = &v(b, 0, k, j, 0); @@ -193,7 +193,7 @@ TaskStatus CalculateFluxes(MeshData *md) { size_t scratch_size = 0; constexpr int scratch_level = 0; parthenon::par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "burgers::reconstruction", DevExecSpace(), scratch_size, + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size, scratch_level, 0, nblocks - 1, kb.s - dk, kb.e + dk, jb.s - dj, jb.e + dj, KOKKOS_LAMBDA(team_mbr_t member, const int b, const int k, const int j) { bool xrec = (k >= kb.s && k <= kb.e) && (j >= jb.s && j <= jb.e); @@ -264,7 +264,7 @@ TaskStatus CalculateFluxes(MeshData *md) { // now we'll solve the Riemann problems to get fluxes scratch_size = 2 * ScratchPad1D::shmem_size(ib.e + 1); parthenon::par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "burgers::reconstruction", DevExecSpace(), scratch_size, + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size, scratch_level, 0, nblocks - 1, kb.s, kb.e + dk, jb.s, jb.e + dj, KOKKOS_LAMBDA(team_mbr_t member, const int b, const int k, const int j) { bool xflux = (k <= kb.e && j <= jb.e); diff --git a/benchmarks/burgers/parthenon_app_inputs.cpp b/benchmarks/burgers/parthenon_app_inputs.cpp index 94d24b8e2068..c831d4ef7802 100644 --- a/benchmarks/burgers/parthenon_app_inputs.cpp +++ b/benchmarks/burgers/parthenon_app_inputs.cpp @@ -53,7 +53,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { const auto num_vars = q.GetDim(4); pmb->par_for( - "Burgers::ProblemGenerator", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { const Real x = coords.Xc<1>(i); const Real y = coords.Xc<2>(j); diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp index 52f8d2c69bda..210b2d474872 100644 --- a/example/advection/advection_package.cpp +++ b/example/advection/advection_package.cpp @@ -237,8 +237,7 @@ AmrTag CheckRefinement(MeshBlockData *rc) { typename Kokkos::MinMax::value_type minmax; pmb->par_reduce( - "advection check refinement", 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, + PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, typename Kokkos::MinMax::value_type &lminmax) { lminmax.min_val = @@ -276,7 +275,7 @@ void PreFill(MeshBlockData *rc) { const int out = imap.get("one_minus_advected").first; const auto num_vars = rc->Get("advected").data.GetDim(4); pmb->par_for( - "advection_package::PreFill", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { v(out + n, k, j, i) = 1.0 - v(in + n, k, j, i); }); @@ -300,7 +299,7 @@ void SquareIt(MeshBlockData *rc) { const int out = imap.get("one_minus_advected_sq").first; const auto num_vars = rc->Get("advected").data.GetDim(4); pmb->par_for( - "advection_package::SquareIt", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { v(out + n, k, j, i) = v(in + n, k, j, i) * v(in + n, k, j, i); }); @@ -317,8 +316,8 @@ void SquareIt(MeshBlockData *rc) { if (profile == "smooth_gaussian") { const auto &advected = rc->Get("advected").data; pmb->par_for( - "advection_package::SquareIt bval check", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, - ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { PARTHENON_REQUIRE(advected(n, k, j, i) != 0.0, "Advected not properly initialized."); }); @@ -353,8 +352,8 @@ void PostFill(MeshBlockData *rc) { const int out37 = imap.get("one_minus_sqrt_one_minus_advected_sq_37").first; const auto num_vars = rc->Get("advected").data.GetDim(4); pmb->par_for( - "advection_package::PostFill", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { v(out12 + n, k, j, i) = 1.0 - sqrt(v(in + n, k, j, i)); v(out37 + n, k, j, i) = 1.0 - v(out12 + n, k, j, i); }); @@ -387,7 +386,8 @@ Real AdvectionHst(MeshData *md) { const bool volume_weighting = std::is_same>::value; pmb->par_reduce( - "AdvectionHst", 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, + ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lresult) { const auto &coords = advected_pack.GetCoords(b); // `join` is a function of the Kokkos::ReducerConecpt that allows to use the same @@ -418,7 +418,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // this is obviously overkill for this constant velocity problem Real min_dt; pmb->par_reduce( - "advection_package::EstimateTimestep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) { if (vx != 0.0) lmin_dt = std::min(lmin_dt, coords.Dxc(k, j, i) / std::abs(vx)); @@ -465,8 +465,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { size_t scratch_size_in_bytes = parthenon::ScratchPad2D::shmem_size(nvar, nx1); // get x-fluxes pmb->par_for_outer( - "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { parthenon::ScratchPad2D ql(member.team_scratch(scratch_level), nvar, nx1); parthenon::ScratchPad2D qr(member.team_scratch(scratch_level), nvar, nx1); // get reconstructed state on faces @@ -498,8 +498,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get y-fluxes if (pmb->pmy_mesh->ndim >= 2) { pmb->par_for_outer( - "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse // reconstructed arrays for different `j` with `j` being part of the outer @@ -541,7 +541,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get z-fluxes if (pmb->pmy_mesh->ndim == 3) { pmb->par_for_outer( - "x3 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e, + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, + jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse diff --git a/example/advection/parthenon_app_inputs.cpp b/example/advection/parthenon_app_inputs.cpp index d55e0243b9ff..43bb74292f06 100644 --- a/example/advection/parthenon_app_inputs.cpp +++ b/example/advection/parthenon_app_inputs.cpp @@ -73,7 +73,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { if (profile == "block") profile_type = 3; pmb->par_for( - "Advection::ProblemGenerator", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { if (profile_type == 0) { Real x = cos_a2 * (coords.Xc<1>(i) * cos_a3 + coords.Xc<2>(j) * sin_a3) + @@ -99,8 +99,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { // initialize some arbitrary cells in the first block that move in all 6 directions if (profile_type == 3 && block_id == 0) { pmb->par_for( - "Advection::ProblemGenerator bvals test", 0, 1, - KOKKOS_LAMBDA(const int /*unused*/) { + PARTHENON_AUTO_LABEL, 0, 1, KOKKOS_LAMBDA(const int /*unused*/) { q(idx_adv, 4, 4, 4) = 10.0; q(idx_v, 4, 4, 4) = vx; q(idx_adv, 4, 6, 4) = 10.0; diff --git a/example/calculate_pi/calculate_pi.cpp b/example/calculate_pi/calculate_pi.cpp index 03ef92fd6872..06d6ba3c1984 100644 --- a/example/calculate_pi/calculate_pi.cpp +++ b/example/calculate_pi/calculate_pi.cpp @@ -88,7 +88,7 @@ void SetInOrOut(MeshBlockData *rc) { // Loop bounds are set to catch the case where the edge is between the // cell centers of the first/last real cell and the first ghost cell pmb->par_for( - "SetInOrOut", kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1, KOKKOS_LAMBDA(const int k, const int j, const int i) { Real rsq = std::pow(coords.Xc<1>(i), 2) + std::pow(coords.Xc<2>(j), 2); if (rsq < radius * radius) { diff --git a/example/kokkos_pi/kokkos_pi.cpp b/example/kokkos_pi/kokkos_pi.cpp index 4bcdbd8d3819..6d5f69c980bc 100644 --- a/example/kokkos_pi/kokkos_pi.cpp +++ b/example/kokkos_pi/kokkos_pi.cpp @@ -272,7 +272,7 @@ result_t naiveParFor(int n_block, int n_mesh, int n_iter, double radius) { auto inOrOut = base->PackVariables({Metadata::Independent}); // iops = 0 fops = 11 par_for( - DEFAULT_LOOP_PATTERN, "par_for in or out", DevExecSpace(), 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, inOrOut.GetDim(4) - 1, nghost, inOrOut.GetDim(3) - nghost - 1, nghost, inOrOut.GetDim(2) - nghost - 1, nghost, inOrOut.GetDim(1) - nghost - 1, KOKKOS_LAMBDA(const int l, const int k_grid, const int j_grid, diff --git a/example/particle_leapfrog/particle_leapfrog.cpp b/example/particle_leapfrog/particle_leapfrog.cpp index b54bd6ddebd3..673e795fdead 100644 --- a/example/particle_leapfrog/particle_leapfrog.cpp +++ b/example/particle_leapfrog/particle_leapfrog.cpp @@ -189,7 +189,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { // This hardcoded implementation should only used in PGEN and not during runtime // addition of particles as indices need to be taken into account. pmb->par_for( - "CreateParticles", 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) { const auto &m = ids_this_block(n); id(n) = m; // global unique id @@ -227,7 +227,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator const Real ay = 0.0; const Real az = 0.0; pmb->par_for( - "Leapfrog", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { // drift x(n) += v(0, n) * 0.5 * dt; diff --git a/example/particle_tracers/particle_tracers.cpp b/example/particle_tracers/particle_tracers.cpp index 1f8394a6c6c4..9cc8a0cb2570 100644 --- a/example/particle_tracers/particle_tracers.cpp +++ b/example/particle_tracers/particle_tracers.cpp @@ -182,7 +182,7 @@ TaskStatus AdvectTracers(MeshBlock *pmb, const StagedIntegrator *integrator) { auto swarm_d = swarm->GetDeviceContext(); pmb->par_for( - "Tracer advection", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { x(n) += vx * dt; y(n) += vy * dt; @@ -219,13 +219,13 @@ TaskStatus DepositTracers(MeshBlock *pmb) { auto &tracer_dep = pmb->meshblock_data.Get()->Get("tracer_deposition").data; // Reset particle count pmb->par_for( - "ZeroParticleDep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { tracer_dep(k, j, i) = 0.; }); const int ndim = pmb->pmy_mesh->ndim; pmb->par_for( - "DepositTracers", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { int i = static_cast(std::floor((x(n) - minx_i) / dx_i) + ib.s); int j = 0; @@ -269,7 +269,7 @@ TaskStatus CalculateFluxes(MeshBlockData *mbd) { // Spatially first order upwind method pmb->par_for( - "CalculateFluxesX1", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1, KOKKOS_LAMBDA(const int k, const int j, const int i) { // X1 if (vx > 0.) { @@ -282,7 +282,7 @@ TaskStatus CalculateFluxes(MeshBlockData *mbd) { if (ndim > 1) { auto x2flux = mbd->Get("advected").flux[X2DIR].Get<4>(); pmb->par_for( - "CalculateFluxesX2", kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { // X2 if (vy > 0.) { @@ -296,7 +296,7 @@ TaskStatus CalculateFluxes(MeshBlockData *mbd) { if (ndim > 2) { auto x3flux = mbd->Get("advected").flux[X3DIR].Get<4>(); pmb->par_for( - "CalculateFluxesX3", kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { // X3 if (vz > 0.) { @@ -355,7 +355,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { const Real kwave = 2. * M_PI / (x_max_mesh - x_min_mesh); pmb->par_for( - "Init advected profile", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { advected(k, j, i) = advected_mean + advected_amp * sin(kwave * coords.Xc<1>(i)); }); @@ -387,7 +387,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { // This hardcoded implementation should only used in PGEN and not during runtime // addition of particles as indices need to be taken into account. pmb->par_for( - "CreateParticles", 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) { auto rng_gen = rng_pool.get_state(); // Rejection sample the x position diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp index 5a112977cc23..384a42ddc432 100644 --- a/example/particles/particles.cpp +++ b/example/particles/particles.cpp @@ -143,7 +143,7 @@ TaskStatus DestroySomeParticles(MeshBlock *pmb) { // Randomly mark some fraction of particles each timestep for removal pmb->par_for( - "DestroySomeParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { auto rng_gen = rng_pool.get_state(); if (rng_gen.drand() > 1.0 - destroy_particles_frac) { @@ -201,13 +201,13 @@ TaskStatus DepositParticles(MeshBlock *pmb) { if (deposition_method == DepositionMethod::per_particle) { // Reset particle count pmb->par_for( - "ZeroParticleDep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { particle_dep(k, j, i) = 0.; }); pmb->par_for( - "DepositParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { int i = static_cast(std::floor((x(n) - minx_i) / dx_i) + ib.s); int j = 0; @@ -227,7 +227,7 @@ TaskStatus DepositParticles(MeshBlock *pmb) { }); } else if (deposition_method == DepositionMethod::per_cell) { pmb->par_for( - "DepositParticlesByCell", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { particle_dep(k, j, i) = 0.; for (int n = 0; n < swarm_d.GetParticleCountPerCell(k, j, i); n++) { @@ -278,8 +278,7 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { if (orbiting_particles) { pmb->par_for( - "CreateSomeOrbitingParticles", 0, swarm->GetMaxActiveIndex(), - KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (new_particles_mask(n)) { auto rng_gen = rng_pool.get_state(); @@ -323,7 +322,7 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { }); } else { pmb->par_for( - "CreateSomeParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (new_particles_mask(n)) { auto rng_gen = rng_pool.get_state(); @@ -393,7 +392,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator if (orbiting_particles) { // Particles orbit the origin pmb->par_for( - "TransportOrbitingParticles", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { Real vel = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); PARTHENON_DEBUG_REQUIRE(vel > 0., "Speed must be > 0!"); @@ -446,7 +445,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator } else { // Particles move in straight lines pmb->par_for( - "TransportParticles", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { Real vel = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); PARTHENON_DEBUG_REQUIRE(vel > 0., "vel must be > 0 for division!"); diff --git a/example/poisson/parthenon_app_inputs.cpp b/example/poisson/parthenon_app_inputs.cpp index 4c2bd177c42f..2dd29551f350 100644 --- a/example/poisson/parthenon_app_inputs.cpp +++ b/example/poisson/parthenon_app_inputs.cpp @@ -50,8 +50,8 @@ void ProblemGenerator(Mesh *pm, ParameterInput *pin, MeshData *md) { const int iphi = imap["potential"].first; pmb->par_for( - "Poisson::ProblemGenerator", 0, q_bpack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, q_bpack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const auto &coords = q_bpack.GetCoords(b); auto &q = q_bpack(b); Real dist2 = std::pow(coords.Xc<1>(i) - x0, 2) + diff --git a/example/poisson/poisson_package.cpp b/example/poisson/poisson_package.cpp index 378bc48a3176..032d0decffd5 100644 --- a/example/poisson/poisson_package.cpp +++ b/example/poisson/poisson_package.cpp @@ -129,8 +129,8 @@ TaskStatus SetMatrixElements(T *u) { const int ndim = v.GetNdim(); const Real w0 = -2.0 * ndim; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "SetMatElem", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, kb.e, - jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { for (int n = isp_lo; n <= isp_hi; n++) { v(b, n, k, j, i) = 1; @@ -168,8 +168,8 @@ TaskStatus SumMass(T *u, Real *reduce_sum) { Real total; parthenon::par_reduce( - parthenon::loop_pattern_mdrange_tag, "SumMass", DevExecSpace(), 0, v.GetDim(5) - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, + v.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &sum) { sum += v(b, irho, k, j, i) * std::pow(dx, ndim); }, @@ -194,8 +194,8 @@ TaskStatus SumDeltaPhi(T *du, Real *reduce_sum) { Real total; parthenon::par_reduce( - parthenon::loop_pattern_mdrange_tag, "SumMass", DevExecSpace(), 0, dv.GetDim(5) - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, + dv.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &sum) { sum += std::pow(dv(b, iphi, k, j, i), 2); }, @@ -245,8 +245,8 @@ TaskStatus UpdatePhi(T *u, T *du) { if (isp_hi < 0) { // there is no sparse matrix, so we must be using the stencil const auto &stencil = pkg->Param("stencil"); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "StencilJacobi", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, - kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const Real rhs = dV * v(b, irho, k, j, i); const Real phi_new = stencil.Jacobi(v, iphi, b, k, j, i, rhs); @@ -257,8 +257,8 @@ TaskStatus UpdatePhi(T *u, T *du) { const auto &sp_accessor = pkg->Param("sparse_accessor"); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "SparseUpdate", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, - kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const Real rhs = dV * v(b, irho, k, j, i); const Real phi_new = @@ -268,8 +268,8 @@ TaskStatus UpdatePhi(T *u, T *du) { } parthenon::par_for( - DEFAULT_LOOP_PATTERN, "UpdatePhi", DevExecSpace(), 0, dv.GetDim(5) - 1, kb.s, kb.e, - jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, dv.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { v(b, iphi, k, j, i) += dv(b, idphi, k, j, i); }); @@ -296,7 +296,7 @@ TaskStatus CheckConvergence(T *u, T *du) { Real max_err; parthenon::par_reduce( - parthenon::loop_pattern_mdrange_tag, "CheckConvergence", DevExecSpace(), 0, + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &eps) { Real reps = std::abs(dv(b, idphi, k, j, i) / v(b, iphi, k, j, i)); diff --git a/example/sparse_advection/parthenon_app_inputs.cpp b/example/sparse_advection/parthenon_app_inputs.cpp index 1cd806accad6..0f9730d7f718 100644 --- a/example/sparse_advection/parthenon_app_inputs.cpp +++ b/example/sparse_advection/parthenon_app_inputs.cpp @@ -97,8 +97,8 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { } pmb->par_for( - "SparseAdvection::ProblemGenerator", 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, - ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { auto x = coords.Xc<1>(i) - x0; auto y = coords.Xc<2>(j) - y0; auto z = coords.Xc<3>(k); diff --git a/example/sparse_advection/sparse_advection_package.cpp b/example/sparse_advection/sparse_advection_package.cpp index b3f8130cbe20..0ffeea6ca96d 100644 --- a/example/sparse_advection/sparse_advection_package.cpp +++ b/example/sparse_advection/sparse_advection_package.cpp @@ -203,8 +203,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { size_t scratch_size_in_bytes = parthenon::ScratchPad2D::shmem_size(nvar, nx1); // get x-fluxes pmb->par_for_outer( - "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { parthenon::ScratchPad2D ql(member.team_scratch(scratch_level), nvar, nx1); parthenon::ScratchPad2D qr(member.team_scratch(scratch_level), nvar, nx1); @@ -226,8 +226,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get y-fluxes if (pmb->pmy_mesh->ndim >= 2) { pmb->par_for_outer( - "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clearly inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse // reconstructed arrays for different `j` with `j` being part of the outer diff --git a/example/stochastic_subgrid/stochastic_subgrid_package.cpp b/example/stochastic_subgrid/stochastic_subgrid_package.cpp index 79be429a3232..f91bf0ad7b75 100644 --- a/example/stochastic_subgrid/stochastic_subgrid_package.cpp +++ b/example/stochastic_subgrid/stochastic_subgrid_package.cpp @@ -262,9 +262,9 @@ TaskStatus ComputeNumIter(std::shared_ptr> &md, Packages_t &packa int N_min = pkg->Param("N_min"); par_for( - parthenon::loop_pattern_mdrange_tag, "ComputeNumIter", parthenon::DevExecSpace(), 0, - pack.GetDim(5) - 1, 0, pack.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(int b, int v, int k, int j, int i) { + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, + parthenon::DevExecSpace(), 0, pack.GetDim(5) - 1, 0, pack.GetDim(4) - 1, kb.s, kb.e, + jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(int b, int v, int k, int j, int i) { auto rng = pool.get_state(); double rand1 = rng.drand(); double rand2 = rng.drand(); @@ -302,7 +302,7 @@ void DoLotsOfWork(MeshBlockData *rc) { const Real ilog10 = 1.0 / log(10.0); pmb->par_for( - "stochastic_subgrid_package::DoLotsOfWork", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { int num_iter = v(niter, k, j, i); @@ -343,7 +343,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // this is obviously overkill for this constant velocity problem Real min_dt; pmb->par_reduce( - "stochastic_subgrid_package::EstimateTimestep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) { if (vx != 0.0) lmin_dt = std::min(lmin_dt, coords.Dxc(k, j, i) / std::abs(vx)); @@ -380,8 +380,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { parthenon::ParArray4D x1flux = rc->Get("advected").flux[X1DIR].Get<4>(); // get x-fluxes pmb->par_for_outer( - "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { parthenon::ScratchPad2D ql(member.team_scratch(scratch_level), nvar, nx1); parthenon::ScratchPad2D qr(member.team_scratch(scratch_level), nvar, nx1); // get reconstructed state on faces @@ -404,8 +404,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { if (pmb->pmy_mesh->ndim >= 2) { parthenon::ParArray4D x2flux = rc->Get("advected").flux[X2DIR].Get<4>(); pmb->par_for_outer( - "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse // reconstructed arrays for different `j` with `j` being part of the outer @@ -436,7 +436,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { if (pmb->pmy_mesh->ndim == 3) { parthenon::ParArray4D x3flux = rc->Get("advected").flux[X3DIR].Get<4>(); pmb->par_for_outer( - "x3 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e, + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, + jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 4f26a9f52809..7c61881308b6 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -82,7 +82,7 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h; Kokkos::parallel_for( - "SendBoundBufs", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -231,7 +231,7 @@ TaskStatus SetBounds(std::shared_ptr> &md) { // const Real threshold = Globals::sparse_config.allocation_threshold; auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( - "SetBoundaryBuffers", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); diff --git a/src/bvals/comms/flux_correction.cpp b/src/bvals/comms/flux_correction.cpp index 50166ec9ca89..ff76bcba0014 100644 --- a/src/bvals/comms/flux_correction.cpp +++ b/src/bvals/comms/flux_correction.cpp @@ -67,7 +67,7 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { auto &bnd_info = cache.bnd_info; PARTHENON_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info"); Kokkos::parallel_for( - "SendFluxCorrectionBufs", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { auto &binfo = bnd_info(team_member.league_rank()); @@ -163,7 +163,7 @@ TaskStatus SetFluxCorrections(std::shared_ptr> &md) { auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( - "SetFluxCorBuffers", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 8359e6fc9a82..166586003bc4 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -76,7 +76,7 @@ DriverStatus EvolutionDriver::Execute() { DumpInputParameters(); { // Main t < tmax loop region - PARTHENON_INSTRUMENT_REGION("evolution_loop") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) while (tm.KeepGoing()) { if (Globals::my_rank == 0) OutputCycleDiagnostics(); diff --git a/src/interface/swarm.cpp b/src/interface/swarm.cpp index c9db0ced5478..981ded597589 100644 --- a/src/interface/swarm.cpp +++ b/src/interface/swarm.cpp @@ -406,7 +406,7 @@ void Swarm::Defrag() { auto &mask = mask_; pmb->par_for( - "Swarm::DefragMask", 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { if (from_to_indices(n) >= 0) { mask(from_to_indices(n)) = mask(n); mask(n) = false; @@ -427,7 +427,7 @@ void Swarm::Defrag() { const int intPackDim = vint.GetDim(2); pmb->par_for( - "Swarm::DefragVariables", 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { if (from_to_indices(n) >= 0) { for (int vidx = 0; vidx < realPackDim; vidx++) { vreal(vidx, from_to_indices(n)) = vreal(vidx, n); @@ -477,7 +477,7 @@ void Swarm::SortParticlesByCell() { // Write an unsorted list pmb->par_for( - "Write unsorted list", 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { int i, j, k; swarm_d.Xtoijk(x(n), y(n), z(n), i, j, k); const int64_t cell_idx_1d = i + nx1 * (j + nx2 * k); @@ -491,7 +491,7 @@ void Swarm::SortParticlesByCell() { const IndexRange &jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire); const IndexRange &kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire); pmb->par_for( - "Update per-cell arrays", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { int cell_idx_1d = i + nx1 * (j + nx2 * k); // Find starting index, first by guessing @@ -970,7 +970,7 @@ void Swarm::LoadBuffers_(const int max_indices_size) { auto particle_indices_to_send = particle_indices_to_send_; auto neighbor_buffer_index = neighbor_buffer_index_; pmb->par_for( - "Pack Buffers", 0, max_indices_size - 1, + PARTHENON_AUTO_LABEL, 0, max_indices_size - 1, KOKKOS_LAMBDA(const int n) { // Max index for (int m = 0; m < nneighbor; m++) { // Number of neighbors const int bufid = neighbor_buffer_index(m); @@ -1109,7 +1109,8 @@ void Swarm::UnloadBuffers_() { auto swarm_d = GetDeviceContext(); pmb->par_for( - "Unload buffers", 0, total_received_particles_ - 1, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, total_received_particles_ - 1, + KOKKOS_LAMBDA(const int n) { const int sid = new_indices(n); const int nid = neighbor_index(n); int bid = buffer_index(n) * particle_size; @@ -1137,7 +1138,7 @@ void Swarm::ApplyBoundaries_(const int nparticles, ParArrayND indices) { auto bcs = this->bounds_d; pmb->par_for( - "Swarm::ApplyBoundaries", 0, nparticles - 1, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, nparticles - 1, KOKKOS_LAMBDA(const int n) { const int sid = indices(n); for (int l = 0; l < 6; l++) { bcs.bounds[l]->Apply(sid, x(sid), y(sid), z(sid), swarm_d); diff --git a/src/interface/update.cpp b/src/interface/update.cpp index 02c3b632926e..b9a7e6c8f778 100644 --- a/src/interface/update.cpp +++ b/src/interface/update.cpp @@ -48,7 +48,7 @@ TaskStatus FluxDivergence(MeshBlockData *in, MeshBlockData *dudt_con const auto &coords = pmb->coords; const int ndim = pmb->pmy_mesh->ndim; pmb->par_for( - "FluxDivergenceBlock", 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) { if (dudt.IsAllocated(l) && vin.IsAllocated(l)) { dudt(l, k, j, i) = FluxDivHelper(l, k, j, i, ndim, coords, vin); @@ -71,7 +71,7 @@ TaskStatus FluxDivergence(MeshData *in_obj, MeshData *dudt_obj) { const int ndim = vin.GetNdim(); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "FluxDivergenceMesh", DevExecSpace(), 0, vin.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, vin.GetDim(5) - 1, 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int m, const int l, const int k, const int j, const int i) { if (dudt.IsAllocated(m, l) && vin.IsAllocated(m, l)) { @@ -100,8 +100,8 @@ TaskStatus UpdateWithFluxDivergence(MeshBlockData *u0_data, const auto &coords = pmb->coords; const int ndim = pmb->pmy_mesh->ndim; pmb->par_for( - "UpdateWithFluxDivergenceBlock", 0, u0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, u0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) { if (u0.IsAllocated(l) && u1.IsAllocated(l)) { u0(l, k, j, i) = gam0 * u0(l, k, j, i) + gam1 * u1(l, k, j, i) + beta_dt * FluxDivHelper(l, k, j, i, ndim, coords, u0); @@ -126,7 +126,7 @@ TaskStatus UpdateWithFluxDivergence(MeshData *u0_data, MeshData *u1_ const int ndim = u0_pack.GetNdim(); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "UpdateWithFluxDivergenceMesh", DevExecSpace(), 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, u0_pack.GetDim(5) - 1, 0, u0_pack.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int m, const int l, const int k, const int j, const int i) { if (u0_pack.IsAllocated(m, l) && u1_pack.IsAllocated(m, l)) { @@ -162,7 +162,7 @@ TaskStatus SparseDealloc(MeshData *md) { const int NjNi = Nj * Ni; const int NkNjNi = Nk * NjNi; Kokkos::parallel_for( - "SparseDealloc", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), pack.GetNBlocks(), Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); diff --git a/src/interface/update.hpp b/src/interface/update.hpp index d2cc37343802..dd4b8158d0b8 100644 --- a/src/interface/update.hpp +++ b/src/interface/update.hpp @@ -75,7 +75,7 @@ TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const const auto &y = in2->PackVariables(flags); const auto &z = out->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "WeightedSumData", DevExecSpace(), 0, x.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0, x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { // TOOD(someone) This is potentially dangerous and/or not intended behavior @@ -98,7 +98,7 @@ TaskStatus SetDataToConstant(const F &flags, T *data, const Real val) { PARTHENON_INSTRUMENT const auto &x = data->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "SetDataToConstant", DevExecSpace(), 0, x.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0, x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (x.IsAllocated(b, l)) { @@ -161,7 +161,7 @@ TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data, Real gam0 = pint->gam0[stage - 1]; Real gam1 = pint->gam1[stage - 1]; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "2S_Update", DevExecSpace(), 0, s0.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, s0.GetDim(5) - 1, 0, s0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (s0.IsAllocated(b, l) && s1.IsAllocated(b, l) && rhs.IsAllocated(b, l)) { @@ -199,7 +199,7 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr base_data, const IndexRange jb = out_data->GetBoundsJ(interior); const IndexRange kb = out_data->GetBoundsK(interior); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ButcherSumInit", DevExecSpace(), 0, out.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1, 0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) { @@ -210,8 +210,8 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr base_data, Real a = pint->a[stage - 1][prev]; const auto &in = stage_data[stage]->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ButcherSum", DevExecSpace(), 0, out.GetDim(5) - 1, 0, - out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1, + 0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) { out(b, l, k, j, i) += dt * a * in(b, l, k, j, i); @@ -247,8 +247,8 @@ TaskStatus UpdateButcher(const F &flags, std::vector> stage_d const Real butcher_b = pint->b[stage]; const auto &in = stage_data[stage]->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ButcherUpdate", DevExecSpace(), 0, out.GetDim(5) - 1, 0, - out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1, + 0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) { out(b, l, k, j, i) += dt * b * in(b, l, k, j, i); @@ -292,19 +292,19 @@ TaskStatus FillDerived(T *rc) { PARTHENON_INSTRUMENT auto pm = rc->GetParentPointer(); { // PreFillDerived region - PARTHENON_INSTRUMENT_REGION("PreFillDerived") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->PreFillDerived(rc); } } // PreFillDerived region { // FillDerived region - PARTHENON_INSTRUMENT_REGION("FillDerived"); + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->FillDerived(rc); } } // FillDerived region { // PostFillDerived region - PARTHENON_INSTRUMENT_REGION("PostFillDerived"); + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->PostFillDerived(rc); } @@ -334,7 +334,7 @@ TaskStatus InitNewlyAllocatedVars(T *rc) { auto v = desc.GetPack(rc); Kokkos::parallel_for( - "Set newly allocated interior to default", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), v.GetNBlocks(), Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp index 4018d8220057..f1eff2959c40 100644 --- a/src/mesh/amr_loadbalance.cpp +++ b/src/mesh/amr_loadbalance.cpp @@ -126,8 +126,8 @@ bool TryRecvCoarseToFine(int lid_recv, int send_rank, const LogicalLocation &fin const int is = (ox1 == 0) ? 0 : (ib_int.e - ib_int.s + 1) / 2; const int idx_te = static_cast(te) % 3; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ReceiveCoarseToFineAMR", DevExecSpace(), 0, nt, 0, nu, - 0, nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nt, 0, nu, 0, + nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int t, const int u, const int v, const int k, const int j, const int i) { cb(idx_te, t, u, v, k, j, i) = fb(idx_te, t, u, v, k + ks, j + js, i + is); @@ -219,8 +219,8 @@ bool TryRecvFineToCoarse(int lid_recv, int send_rank, const LogicalLocation &fin const int is = (ox1 == 0) ? 0 : (ib.e - ib.s + 1 - TopologicalOffsetI(te)); const int idx_te = static_cast(te) % 3; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ReceiveFineToCoarseAMR", DevExecSpace(), 0, nt, 0, nu, - 0, nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nt, 0, nu, 0, + nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int t, const int u, const int v, const int k, const int j, const int i) { fb(idx_te, t, u, v, k + ks, j + js, i + is) = cb(idx_te, t, u, v, k, j, i); @@ -682,7 +682,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput int onbe = onbs + nblist[Globals::my_rank] - 1; { // Construct new list region - PARTHENON_INSTRUMENT_REGION("Construct_new_list") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal); // create a list mapping the previous gid to the current one @@ -759,7 +759,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput // Send data from old to new blocks std::vector send_reqs; { // AMR Send region - PARTHENON_INSTRUMENT_REGION("AMR_Send") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) for (int n = onbs; n <= onbe; n++) { int nn = oldtonew[n]; LogicalLocation &oloc = loclist[n]; @@ -791,7 +791,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput // Construct a new MeshBlock list (moving the data within the MPI rank) BlockList_t new_block_list(nbe - nbs + 1); { // AMR Construct new MeshBlockList region - PARTHENON_INSTRUMENT_REGION("AMR_Construct_new_list") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) RegionSize block_size = GetBlockSize(); for (int n = nbs; n <= nbe; n++) { @@ -831,7 +831,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput // Receive the data and load into MeshBlocks { // AMR Recv and unpack data - PARTHENON_INSTRUMENT_REGION("AMR_Recv_unpack") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) bool all_received; int niter = 0; if (block_list.size() > 0) { diff --git a/src/outputs/ascent.cpp b/src/outputs/ascent.cpp index 57db543f0900..13af3b50f613 100644 --- a/src/outputs/ascent.cpp +++ b/src/outputs/ascent.cpp @@ -150,7 +150,7 @@ void AscentOutput::WriteOutputFile(Mesh *pm, ParameterInput *pin, SimTime *tm, const int njni = nj * ni; auto &ghost_mask = ghost_mask_; // redef to lambda capture class member pmb->par_for( - "Set ascent ghost mask", 0, ncells - 1, KOKKOS_LAMBDA(const int &idx) { + PARTHENON_AUTO_LABEL, 0, ncells - 1, KOKKOS_LAMBDA(const int &idx) { const int k = idx / (njni); const int j = (idx - k * njni) / ni; const int i = idx - k * njni - j * nj; diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp index 866d4096015d..1635627e1056 100644 --- a/src/outputs/parthenon_hdf5.cpp +++ b/src/outputs/parthenon_hdf5.cpp @@ -113,9 +113,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // WRITING ATTRIBUTES // // -------------------------------------------------------------------------------- // H5G info_group; - { // write attributes - PARTHENON_INSTRUMENT_REGION("write_attributes") { - PARTHENON_INSTRUMENT_REGION("write_input") + { // write attributes + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) { // Input section + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) // write input key-value pairs std::ostringstream oss; pin->ParameterDump(oss); @@ -129,7 +129,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // we'll need this again at the end info_group = MakeGroup(file, "/Info"); { - PARTHENON_INSTRUMENT_REGION("write_info") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group); if (tm != nullptr) { @@ -190,7 +190,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // write Params { - PARTHENON_INSTRUMENT_REGION("write_params") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) const H5G params_group = MakeGroup(file, "/Params"); for (const auto &package : pm->packages.AllPackages()) { @@ -236,7 +236,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // write Blocks metadata { - PARTHENON_INSTRUMENT_REGION("write_block_metadata") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) const H5G gBlocks = MakeGroup(file, "/Blocks"); // write Xmin[ndim] for blocks @@ -270,7 +270,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // Write mesh coordinates to file { // write mesh coords - PARTHENON_INSTRUMENT_REGION("write_mesh_coords"); + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) for (const bool face : {true, false}) { const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations"); @@ -298,7 +298,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // Write Levels and Logical Locations with the level for each Meshblock loclist contains // levels and logical locations for all meshblocks on all ranks { - PARTHENON_INSTRUMENT_REGION("write_levels_and_locations") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) const auto &loclist = pm->GetLocList(); std::vector levels; @@ -335,7 +335,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm hsize_t num_sparse; std::unique_ptr sparse_allocated; { // write all variable data - PARTHENON_INSTRUMENT_REGION("write_all_variable_data") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) // All blocks have the same list of variable metadata that exist in the entire // simulation, but not all variables may be allocated on all blocks @@ -410,7 +410,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // for each variable we write for (auto &vinfo : all_vars_info) { - PARTHENON_INSTRUMENT_REGION("write_variable_loop") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) // not really necessary, but doesn't hurt memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT)); @@ -480,7 +480,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm hsize_t index = 0; { // fill host output buffer - PARTHENON_INSTRUMENT_REGION("fill_host_output_buffer") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) // for each local mesh block for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) { const auto &pmb = pm->block_list[b_idx]; @@ -553,7 +553,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm } // fill host output buffer { // write variable data - PARTHENON_INSTRUMENT_REGION("write_variable_data") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) // write data to file HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt, p_glob_cnt, pl_xfer, pl_dcreate); @@ -592,7 +592,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write // this if we have sparse fields) if (num_sparse > 0) { - PARTHENON_INSTRUMENT_REGION("write_sparse_info") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) local_count[1] = global_count[1] = num_sparse; HDF5Write2D(file, "SparseInfo", sparse_allocated.get(), p_loc_offset, p_loc_cnt, @@ -613,7 +613,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm AllSwarmInfo swarm_info(pm->block_list, output_params.swarms, restart_); { // write particle data - PARTHENON_INSTRUMENT_REGION("write_particle_data") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) for (auto &[swname, swinfo] : swarm_info.all_info) { const H5G g_swm = MakeGroup(file, swname); // offsets/counts are NOT the same here vs the grid data @@ -676,7 +676,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm } // write particle data { // generate xdmf - PARTHENON_INSTRUMENT_REGION("genXDMF") + PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) // generate XDMF companion file XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info); } // generate xdmf diff --git a/src/prolong_restrict/pr_loops.hpp b/src/prolong_restrict/pr_loops.hpp index 6f9cf823bc37..9dc02e9e92fe 100644 --- a/src/prolong_restrict/pr_loops.hpp +++ b/src/prolong_restrict/pr_loops.hpp @@ -123,8 +123,8 @@ ProlongationRestrictionLoop(const ProResInfoArr_t &info, const Idx_t &buffer_idx const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM size_t scratch_size_in_bytes = 1; par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "ProlongateOrRestrictCellCenteredValues", - DevExecSpace(), scratch_size_in_bytes, scratch_level, 0, nbuffers - 1, + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + scratch_size_in_bytes, scratch_level, 0, nbuffers - 1, KOKKOS_LAMBDA(team_mbr_t team_member, const int sub_idx) { const std::size_t buf = buffer_idxs(sub_idx); if (DoRefinementOp(info(buf), op)) { @@ -157,8 +157,8 @@ InnerHostProlongationRestrictionLoop(std::size_t buf, const ProResInfoArrHost_t auto coarse = info(buf).coarse; auto fine = info(buf).fine; par_for( - DEFAULT_LOOP_PATTERN, "ProlongateOrRestrictCellCenteredValues", DevExecSpace(), 0, - 0, 0, 0, 0, idxer.size() - 1, KOKKOS_LAMBDA(const int, const int, const int ii) { + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, 0, 0, 0, 0, + idxer.size() - 1, KOKKOS_LAMBDA(const int, const int, const int ii) { const auto [t, u, v, k, j, i] = idxer(ii); if (idxer.IsActive(k, j, i)) { Stencil::template Do(t, u, v, k, j, i, ckb, cjb, cib, kb, jb, ib, diff --git a/src/utils/buffer_utils.cpp b/src/utils/buffer_utils.cpp index 65213d9421c7..d2a2e418c83f 100644 --- a/src/utils/buffer_utils.cpp +++ b/src/utils/buffer_utils.cpp @@ -44,7 +44,7 @@ void PackData(ParArray4D &src, BufArray1D &buf, int sn, int en, int si, in const int nn = en + 1 - sn; pmb->par_for( - "PackData 4D", sn, en, sk, ek, sj, ej, si, ei, + PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn)))) = src(n, k, j, i); @@ -62,7 +62,7 @@ void PackZero(BufArray1D &buf, int sn, int en, int si, int ei, int sj, int ej const int nn = en + 1 - sn; pmb->par_for( - "PackZero 4D", sn, en, sk, ek, sj, ej, si, ei, + PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn)))) = 0.0; }); @@ -84,7 +84,7 @@ void PackData(ParArray3D &src, BufArray1D &buf, int si, int ei, int sj, in const int nk = ek + 1 - sk; pmb->par_for( - "PackData 3D", sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) { + PARTHENON_AUTO_LABEL, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) { buf(offset + i - si + ni * (j - sj + nj * (k - sk))) = src(k, j, i); }); @@ -107,7 +107,7 @@ void UnpackData(BufArray1D &buf, ParArray4D &dst, int sn, int en, int si, const int nn = en + 1 - sn; pmb->par_for( - "UnpackData 4D", sn, en, sk, ek, sj, ej, si, ei, + PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int n, int k, int j, int i) { dst(n, k, j, i) = buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn)))); @@ -131,7 +131,7 @@ void UnpackData(BufArray1D &buf, ParArray3D &dst, int si, int ei, int sj, const int nk = ek + 1 - sk; pmb->par_for( - "UnpackData 3D", sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) { + PARTHENON_AUTO_LABEL, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) { dst(k, j, i) = buf(offset + i - si + ni * (j - sj + nj * (k - sk))); }); diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp index 3fa3fcb4f7c4..6e1030056141 100644 --- a/src/utils/instrument.hpp +++ b/src/utils/instrument.hpp @@ -18,24 +18,37 @@ #include #define PARTHENON_INSTRUMENT KokkosTimer internal_inst_(__FILE__, __LINE__, __func__); -#define PARTHENON_INSTRUMENT_REGION(name) \ - KokkosTimer internal_inst_reg____LINE__(__FILE__, __LINE__, name); +#define PARTHENON_INSTRUMENT_REGION(name) KokkosTimer internal_inst_reg_(name); +#define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__) namespace parthenon { +namespace detail { +inline std::string strip_full_path(const std::string &full_path) { + std::string label; + auto npos = full_path.rfind('/'); + if (npos == std::string::npos) { + label = full_path; + } else { + label = full_path.substr(npos + 1); + } + return label; +} +} // namespace detail + +inline std::string build_auto_label(const std::string &file, const int line, + const std::string &name) { + return detail::strip_full_path(file) + "::" + std::to_string(line) + "::" + name; +} struct KokkosTimer { KokkosTimer(const std::string &file, const int line, const std::string &name) { - std::string label; - auto npos = file.rfind('/'); - if (npos == std::string::npos) { - label = file; - } else { - label = file.substr(npos + 1); - } - label += "::" + std::to_string(line) + "::" + name; - Kokkos::Profiling::pushRegion(label); + Push(build_auto_label(file, line, name)); } + KokkosTimer(const std::string &name) { Push(name); } ~KokkosTimer() { Kokkos::Profiling::popRegion(); } + + private: + void Push(const std::string &name) { Kokkos::Profiling::pushRegion(name); } }; } // namespace parthenon From 410ceb5117bf504d300c743c041e2bc10d94b135 Mon Sep 17 00:00:00 2001 From: jdolence Date: Tue, 31 Oct 2023 17:07:21 -0600 Subject: [PATCH 05/16] missed one --- src/bvals/boundary_conditions_generic.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bvals/boundary_conditions_generic.hpp b/src/bvals/boundary_conditions_generic.hpp index 9ee8cf5eba8e..76a52b899c88 100644 --- a/src/bvals/boundary_conditions_generic.hpp +++ b/src/bvals/boundary_conditions_generic.hpp @@ -88,7 +88,7 @@ void GenericBC(std::shared_ptr> &rc, bool coarse, const int offsetin = INNER; const int offsetout = !INNER; pmb->par_for_bndry( - label, nb, domain, el, coarse, + PARTHENON_AUTO_LABEL, nb, domain, el, coarse, KOKKOS_LAMBDA(const int &l, const int &k, const int &j, const int &i) { if (TYPE == BCType::Reflect) { const bool reflect = (q(b, el, l).vector_component == DIR); From 14a10606f2e2bfcd314912c209fbd765bf17f11a Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 09:08:33 -0600 Subject: [PATCH 06/16] fix style --- src/mesh/amr_loadbalance.cpp | 1 - src/outputs/parthenon_hdf5.cpp | 22 ++++++++++------------ src/utils/instrument.hpp | 4 ++-- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp index f1eff2959c40..0814c5ccb487 100644 --- a/src/mesh/amr_loadbalance.cpp +++ b/src/mesh/amr_loadbalance.cpp @@ -961,7 +961,6 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput for (auto &pmb : block_list) { pmb->pbval->SearchAndSetNeighbors(tree, ranklist.data(), nslist.data()); } - } // AMR Recv and unpack data ResetLoadBalanceVariables(); diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp index 1635627e1056..3e9e5d55c2e7 100644 --- a/src/outputs/parthenon_hdf5.cpp +++ b/src/outputs/parthenon_hdf5.cpp @@ -128,7 +128,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // we'll need this again at the end info_group = MakeGroup(file, "/Info"); - { + { // write info PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group); @@ -186,10 +186,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm } HDF5WriteAttribute("BoundaryConditions", boundary_condition_str, info_group); - } // Info section + } // write info - // write Params - { + { // write params PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) const H5G params_group = MakeGroup(file, "/Params"); @@ -198,7 +197,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // Write all params that can be written as HDF5 attributes state->AllParams().WriteAllToHDF5(state->label(), params_group); } - } // Params section + } // write params } // write attributes // -------------------------------------------------------------------------------- // @@ -234,8 +233,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm PARTHENON_HDF5_CHECK(H5Pset_dxpl_mpio(pl_xfer, H5FD_MPIO_COLLECTIVE)); #endif - // write Blocks metadata - { + { // write Blocks metadata PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) const H5G gBlocks = MakeGroup(file, "/Blocks"); @@ -266,7 +264,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm HDF5Write2D(gBlocks, "loc.level-gid-lid-cnghost-gflag", tmpID.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, pl_xfer); } - } // Block section + } // write Blocks metadata // Write mesh coordinates to file { // write mesh coords @@ -297,7 +295,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // Write Levels and Logical Locations with the level for each Meshblock loclist contains // levels and logical locations for all meshblocks on all ranks - { + { // write levels + log locs PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) const auto &loclist = pm->GetLocList(); @@ -325,7 +323,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // reset for collective output local_count[0] = num_blocks_local; - } + } // write levels + log locs // -------------------------------------------------------------------------------- // // WRITING VARIABLES DATA // @@ -558,8 +556,8 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt, p_glob_cnt, pl_xfer, pl_dcreate); } // write variable data - } - } // write all variable data + } // for each variable + } // write all variable data // names of variables std::vector var_names; diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp index 6e1030056141..1f3b3972d0fd 100644 --- a/src/utils/instrument.hpp +++ b/src/utils/instrument.hpp @@ -44,7 +44,7 @@ struct KokkosTimer { KokkosTimer(const std::string &file, const int line, const std::string &name) { Push(build_auto_label(file, line, name)); } - KokkosTimer(const std::string &name) { Push(name); } + explicit KokkosTimer(const std::string &name) { Push(name); } ~KokkosTimer() { Kokkos::Profiling::popRegion(); } private: @@ -53,4 +53,4 @@ struct KokkosTimer { } // namespace parthenon -#endif +#endif // UTILS_INSTRUMENT_HPP_ From e9f755e5ca5ca014a0a9dc72f348fa19401139b8 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 09:33:29 -0600 Subject: [PATCH 07/16] make auto named profiling variables unique --- src/utils/instrument.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp index 1f3b3972d0fd..df0a072e13d6 100644 --- a/src/utils/instrument.hpp +++ b/src/utils/instrument.hpp @@ -17,9 +17,15 @@ #include -#define PARTHENON_INSTRUMENT KokkosTimer internal_inst_(__FILE__, __LINE__, __func__); -#define PARTHENON_INSTRUMENT_REGION(name) KokkosTimer internal_inst_reg_(name); +#define __UNIQUE_INST_VAR2(x, y) x##y +#define __UNIQUE_INST_VAR(x, y) __UNIQUE_INST_VAR2(x, y) +#define PARTHENON_INSTRUMENT \ + KokkosTimer __UNIQUE_INST_VAR(internal_inst, __LINE__)(__FILE__, __LINE__, __func__); +#define PARTHENON_INSTRUMENT_REGION(name) \ + KokkosTimer __UNIQUE_INST_VAR(internal_inst_reg, __LINE__)(name); #define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__) +//#undef UNIQUE_VAR +//#undef UNIQUE_VAR2 namespace parthenon { namespace detail { From 3b70bbb08b4b9f6d1a2c38c15fc647ed5a741843 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 12:17:58 -0600 Subject: [PATCH 08/16] missed a few --- src/amr_criteria/refinement_package.cpp | 11 +++++++---- src/interface/swarm.cpp | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/amr_criteria/refinement_package.cpp b/src/amr_criteria/refinement_package.cpp index bc6744cab109..57542e9e5775 100644 --- a/src/amr_criteria/refinement_package.cpp +++ b/src/amr_criteria/refinement_package.cpp @@ -59,6 +59,7 @@ AmrTag CheckAllRefinement(MeshBlockData *rc) { // 2) the code must maintain proper nesting, which sometimes means a block that is // tagged as "derefine" must be left alone (or possibly refined?) because of // neighboring blocks. Similarly for "do nothing" + PARTHENON_INSTRUMENT std::shared_ptr pmb = rc->GetBlockPointer(); // delta_level holds the max over all criteria. default to derefining. AmrTag delta_level = AmrTag::derefine; @@ -90,11 +91,12 @@ AmrTag CheckAllRefinement(MeshBlockData *rc) { AmrTag FirstDerivative(const AMRBounds &bnds, const ParArray3D &q, const Real refine_criteria, const Real derefine_criteria) { + PARTHENON_INSTRUMENT const int ndim = 1 + (bnds.je > bnds.js) + (bnds.ke > bnds.ks); Real maxd = 0.0; par_reduce( - loop_pattern_mdrange_tag, "refinement first derivative", DevExecSpace(), bnds.ks, - bnds.ke, bnds.js, bnds.je, bnds.is, bnds.ie, + loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), bnds.ks, bnds.ke, + bnds.js, bnds.je, bnds.is, bnds.ie, KOKKOS_LAMBDA(int k, int j, int i, Real &maxd) { Real scale = std::abs(q(k, j, i)); Real d = @@ -118,11 +120,12 @@ AmrTag FirstDerivative(const AMRBounds &bnds, const ParArray3D &q, AmrTag SecondDerivative(const AMRBounds &bnds, const ParArray3D &q, const Real refine_criteria, const Real derefine_criteria) { + PARTHENON_INSTRUMENT const int ndim = 1 + (bnds.je > bnds.js) + (bnds.ke > bnds.ks); Real maxd = 0.0; par_reduce( - loop_pattern_mdrange_tag, "refinement second derivative", DevExecSpace(), bnds.ks, - bnds.ke, bnds.js, bnds.je, bnds.is, bnds.ie, + loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), bnds.ks, bnds.ke, + bnds.js, bnds.je, bnds.is, bnds.ie, KOKKOS_LAMBDA(int k, int j, int i, Real &maxd) { Real aqt = std::abs(q(k, j, i)) + TINY_NUMBER; Real qavg = 0.5 * (q(k, j, i + 1) + q(k, j, i - 1)); diff --git a/src/interface/swarm.cpp b/src/interface/swarm.cpp index 981ded597589..c6106d387c02 100644 --- a/src/interface/swarm.cpp +++ b/src/interface/swarm.cpp @@ -1005,7 +1005,7 @@ void Swarm::Send(BoundaryCommSubset phase) { int total_sent_particles = 0; pmb->par_reduce( - "total sent particles", 0, max_active_index_, + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(int n, int &total_sent_particles) { if (swarm_d.IsActive(n)) { if (!swarm_d.IsOnCurrentMeshBlock(n)) { From 220a0532a0d167a87e9dd53dbce08754cc7540b6 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 12:56:44 -0600 Subject: [PATCH 09/16] add missing function profiling calls. add new badly written script to post process output from kokkos simple kernel timer. --- .../process_timer.py | 140 ++++++++++++++++++ src/prolong_restrict/pr_loops.hpp | 3 + 2 files changed, 143 insertions(+) create mode 100644 scripts/python/packages/parthenon_process_kernel_timer/process_timer.py diff --git a/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py new file mode 100644 index 000000000000..afa31bd03339 --- /dev/null +++ b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py @@ -0,0 +1,140 @@ +# ========================================================================================= +# (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +# +# This program was produced under U.S. Government contract 89233218CNA000001 for Los +# Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +# for the U.S. Department of Energy/National Nuclear Security Administration. All rights +# in the program are reserved by Triad National Security, LLC, and the U.S. Department +# of Energy/National Nuclear Security Administration. The Government is granted for +# itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +# license in this material to reproduce, prepare derivative works, distribute copies to +# the public, perform publicly and display publicly, and to permit others to do so. +# ========================================================================================= + +""" +Process the text profiling data spit out by the kokkos simple kernel timer, arranging things +in a convenient way, assuming all the region/kernel names are auto-generated by parthenon +""" + +import sys + +class region: + def __init__(self, fl, line, data): + self.file = fl + self.line = line + self.data = data + +class func: + def __init__(self, name): + self.name = name + self.file = "" + self.reg = {} + self.reg_type = {} + self.total_time = 0.0 + self.total_pct = 0.0 + self.total_kernel_time = 0.0 + def add_region(self, fl, line, data): + if self.file == "": + self.file = fl + else: + if fl != self.file: + print("Duplicate function name found") + sys.exit(1) + dsplit = data.split() + self.reg_type[line] = dsplit[0][1:-1] + self.reg[line] = [float(dsplit[1]), float(dsplit[5].rstrip())] + if self.reg_type[line] != "REGION": + self.total_kernel_time += self.reg[line][0] + def compute_total_cost(self): + if len(self.reg) == 1: + for key in self.reg.keys(): + if self.reg_type[key] != "REGION": + self.name += " ***WARNING: no function-level profiling***" + self.total_time = self.reg[key][0] + self.total_pct = self.reg[key][1] + return + # sort by line number + keys = list(self.reg.keys()) + key_int = [int(k) for k in keys] + key_int.sort() + keys = [str(k) for k in key_int] + if self.reg_type[keys[0]] == "REGION": + # assume this is the time for the function + self.total_time = self.reg[keys[0]][0] + self.total_pct = self.reg[keys[0]][1] + else : + # assume there are just kernel timers + self.name += " ***WARNING: no function-level profiling***" + wall = 0.0 + for key,val in self.reg.items(): + if self.reg_type[key] != "REGION": + self.total_time += self.reg[key][0] + wall = self.reg[key][0]/self.reg[key][1] + self.total_pct = self.total_time / wall + def print_region(self): + print(self.name, "time: " + str(self.total_time), "%wall: " + str(self.total_pct)) + for key,val in self.reg.items(): + if val[0] != self.total_time: + print(" ", "type: " + self.reg_type[key], "line: " + key, "selftime: " + str(val[0]), "%func: " + str(100*val[0]/self.total_time)) + if self.total_time > 0: + print(" ", "Kernel summary: Total kernel time: " + str(self.total_kernel_time), " % Time in kernels: ", str(100*self.total_kernel_time/self.total_time)) + else: + print(" ", "Apparently this function took zero time to execute???") + +def parse_name(s): + if s[0:6] == "Kokkos": + f = s.rstrip() + line = "na" + fl = "na" + label = f + else: + words = s.split("::") + if words[0] == "kokkos_abstraction.hpp": + return "", "", "", "", True + # make sure it follows the filename::line_number::name convection + if ".cpp" in s or ".hpp" in s: + # now strip away the file and line + fl = s[:s.find(":")] + f = s[s.find(":")+2:] + line = f[:f.find(":")] + f = f[f.find(":")+2:] + label = (fl + "::" + f).rstrip() + else: + print("nonconforming entry", s) + sys.exit(1) + return f, line, fl, label, False + +def main(prof): + funcs = {} + with open(prof) as fp: + raw = fp.readlines() + if raw[0].rstrip() != "Regions:": + print(prof + " does not appear to be a profile from the Kokkos simple kernel timer") + print(raw[0]) + sys.exit(1) + cline = 2 + in_regions = True + # process regions/kernels + while raw[cline].rstrip() != "": + sraw = raw[cline][2::] + f, line, fl, label, skip = parse_name(sraw) + if skip: + cline += 2 + continue + if label not in funcs.keys(): + funcs[label] = func(label) + funcs[label].add_region(fl, line, raw[cline+1].rstrip()) + cline += 2 + if raw[cline].rstrip() == "" and in_regions: + cline += 4 + in_regions = False + + for key in funcs.keys(): + funcs[key].compute_total_cost() + + for key in (sorted(funcs, key=lambda name: funcs[name].total_time, reverse=True)): + funcs[key].print_region() + print() + +if __name__ == "__main__": + main(sys.argv[1]) \ No newline at end of file diff --git a/src/prolong_restrict/pr_loops.hpp b/src/prolong_restrict/pr_loops.hpp index 9dc02e9e92fe..2cb07ac4c547 100644 --- a/src/prolong_restrict/pr_loops.hpp +++ b/src/prolong_restrict/pr_loops.hpp @@ -64,6 +64,7 @@ KOKKOS_INLINE_FUNCTION void InnerProlongationRestrictionLoop( team_mbr_t &team_member, std::size_t buf, const ProResInfoArr_t &info, const IndexRange &ckb, const IndexRange &cjb, const IndexRange &cib, const IndexRange &kb, const IndexRange &jb, const IndexRange &ib) { + PARTHENON_INSTRUMENT const auto &idxer = info(buf).idxer[static_cast(CEL)]; par_for_inner( inner_loop_pattern_tvr_tag, team_member, 0, idxer.size() - 1, [&](const int ii) { @@ -113,6 +114,7 @@ inline void ProlongationRestrictionLoop(const ProResInfoArr_t &info, const Idx_t &buffer_idxs, const IndexShape &cellbounds, const IndexShape &c_cellbounds, const RefinementOp_t op, const std::size_t nbuffers) { + PARTHENON_INSTRUMENT const IndexDomain interior = IndexDomain::interior; auto ckb = c_cellbounds.GetBoundsK(interior); auto cjb = c_cellbounds.GetBoundsJ(interior); @@ -151,6 +153,7 @@ InnerHostProlongationRestrictionLoop(std::size_t buf, const ProResInfoArrHost_t const IndexRange &ckb, const IndexRange &cjb, const IndexRange &cib, const IndexRange &kb, const IndexRange &jb, const IndexRange &ib) { + PARTHENON_INSTRUMENT const auto &idxer = info(buf).idxer[static_cast(CEL)]; auto coords = info(buf).coords; auto coarse_coords = info(buf).coarse_coords; From 883323c0be50ac77a7cdf091294a3e4c33322021 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 13:43:47 -0600 Subject: [PATCH 10/16] formatting --- .../process_timer.py | 244 ++++++++++-------- 1 file changed, 134 insertions(+), 110 deletions(-) diff --git a/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py index afa31bd03339..e23a40bbf72b 100644 --- a/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py +++ b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py @@ -18,123 +18,147 @@ import sys + class region: - def __init__(self, fl, line, data): - self.file = fl - self.line = line - self.data = data + def __init__(self, fl, line, data): + self.file = fl + self.line = line + self.data = data + class func: - def __init__(self, name): - self.name = name - self.file = "" - self.reg = {} - self.reg_type = {} - self.total_time = 0.0 - self.total_pct = 0.0 - self.total_kernel_time = 0.0 - def add_region(self, fl, line, data): - if self.file == "": - self.file = fl - else: - if fl != self.file: - print("Duplicate function name found") - sys.exit(1) - dsplit = data.split() - self.reg_type[line] = dsplit[0][1:-1] - self.reg[line] = [float(dsplit[1]), float(dsplit[5].rstrip())] - if self.reg_type[line] != "REGION": - self.total_kernel_time += self.reg[line][0] - def compute_total_cost(self): - if len(self.reg) == 1: - for key in self.reg.keys(): - if self.reg_type[key] != "REGION": - self.name += " ***WARNING: no function-level profiling***" - self.total_time = self.reg[key][0] - self.total_pct = self.reg[key][1] - return - # sort by line number - keys = list(self.reg.keys()) - key_int = [int(k) for k in keys] - key_int.sort() - keys = [str(k) for k in key_int] - if self.reg_type[keys[0]] == "REGION": - # assume this is the time for the function - self.total_time = self.reg[keys[0]][0] - self.total_pct = self.reg[keys[0]][1] - else : - # assume there are just kernel timers - self.name += " ***WARNING: no function-level profiling***" - wall = 0.0 - for key,val in self.reg.items(): - if self.reg_type[key] != "REGION": - self.total_time += self.reg[key][0] - wall = self.reg[key][0]/self.reg[key][1] - self.total_pct = self.total_time / wall - def print_region(self): - print(self.name, "time: " + str(self.total_time), "%wall: " + str(self.total_pct)) - for key,val in self.reg.items(): - if val[0] != self.total_time: - print(" ", "type: " + self.reg_type[key], "line: " + key, "selftime: " + str(val[0]), "%func: " + str(100*val[0]/self.total_time)) - if self.total_time > 0: - print(" ", "Kernel summary: Total kernel time: " + str(self.total_kernel_time), " % Time in kernels: ", str(100*self.total_kernel_time/self.total_time)) - else: - print(" ", "Apparently this function took zero time to execute???") + def __init__(self, name): + self.name = name + self.file = "" + self.reg = {} + self.reg_type = {} + self.total_time = 0.0 + self.total_pct = 0.0 + self.total_kernel_time = 0.0 + + def add_region(self, fl, line, data): + if self.file == "": + self.file = fl + else: + if fl != self.file: + print("Duplicate function name found") + sys.exit(1) + dsplit = data.split() + self.reg_type[line] = dsplit[0][1:-1] + self.reg[line] = [float(dsplit[1]), float(dsplit[5].rstrip())] + if self.reg_type[line] != "REGION": + self.total_kernel_time += self.reg[line][0] + + def compute_total_cost(self): + if len(self.reg) == 1: + for key in self.reg.keys(): + if self.reg_type[key] != "REGION": + self.name += " ***WARNING: no function-level profiling***" + self.total_time = self.reg[key][0] + self.total_pct = self.reg[key][1] + return + # sort by line number + keys = list(self.reg.keys()) + key_int = [int(k) for k in keys] + key_int.sort() + keys = [str(k) for k in key_int] + if self.reg_type[keys[0]] == "REGION": + # assume this is the time for the function + self.total_time = self.reg[keys[0]][0] + self.total_pct = self.reg[keys[0]][1] + else: + # assume there are just kernel timers + self.name += " ***WARNING: no function-level profiling***" + wall = 0.0 + for key, val in self.reg.items(): + if self.reg_type[key] != "REGION": + self.total_time += self.reg[key][0] + wall = self.reg[key][0] / self.reg[key][1] + self.total_pct = self.total_time / wall + + def print_region(self): + print( + self.name, "time: " + str(self.total_time), "%wall: " + str(self.total_pct) + ) + for key, val in self.reg.items(): + if val[0] != self.total_time: + print( + " ", + "type: " + self.reg_type[key], + "line: " + key, + "selftime: " + str(val[0]), + "%func: " + str(100 * val[0] / self.total_time), + ) + if self.total_time > 0: + print( + " ", + "Kernel summary: Total kernel time: " + str(self.total_kernel_time), + " % Time in kernels: ", + str(100 * self.total_kernel_time / self.total_time), + ) + else: + print(" ", "Apparently this function took zero time to execute???") + def parse_name(s): - if s[0:6] == "Kokkos": - f = s.rstrip() - line = "na" - fl = "na" - label = f - else: - words = s.split("::") - if words[0] == "kokkos_abstraction.hpp": - return "", "", "", "", True - # make sure it follows the filename::line_number::name convection - if ".cpp" in s or ".hpp" in s: - # now strip away the file and line - fl = s[:s.find(":")] - f = s[s.find(":")+2:] - line = f[:f.find(":")] - f = f[f.find(":")+2:] - label = (fl + "::" + f).rstrip() + if s[0:6] == "Kokkos": + f = s.rstrip() + line = "na" + fl = "na" + label = f else: - print("nonconforming entry", s) - sys.exit(1) - return f, line, fl, label, False + words = s.split("::") + if words[0] == "kokkos_abstraction.hpp": + return "", "", "", "", True + # make sure it follows the filename::line_number::name convection + if ".cpp" in s or ".hpp" in s: + # now strip away the file and line + fl = s[: s.find(":")] + f = s[s.find(":") + 2 :] + line = f[: f.find(":")] + f = f[f.find(":") + 2 :] + label = (fl + "::" + f).rstrip() + else: + print("nonconforming entry", s) + sys.exit(1) + return f, line, fl, label, False + def main(prof): - funcs = {} - with open(prof) as fp: - raw = fp.readlines() - if raw[0].rstrip() != "Regions:": - print(prof + " does not appear to be a profile from the Kokkos simple kernel timer") - print(raw[0]) - sys.exit(1) - cline = 2 - in_regions = True - # process regions/kernels - while raw[cline].rstrip() != "": - sraw = raw[cline][2::] - f, line, fl, label, skip = parse_name(sraw) - if skip: - cline += 2 - continue - if label not in funcs.keys(): - funcs[label] = func(label) - funcs[label].add_region(fl, line, raw[cline+1].rstrip()) - cline += 2 - if raw[cline].rstrip() == "" and in_regions: - cline += 4 - in_regions = False - - for key in funcs.keys(): - funcs[key].compute_total_cost() - - for key in (sorted(funcs, key=lambda name: funcs[name].total_time, reverse=True)): - funcs[key].print_region() - print() + funcs = {} + with open(prof) as fp: + raw = fp.readlines() + if raw[0].rstrip() != "Regions:": + print( + prof + + " does not appear to be a profile from the Kokkos simple kernel timer" + ) + print(raw[0]) + sys.exit(1) + cline = 2 + in_regions = True + # process regions/kernels + while raw[cline].rstrip() != "": + sraw = raw[cline][2::] + f, line, fl, label, skip = parse_name(sraw) + if skip: + cline += 2 + continue + if label not in funcs.keys(): + funcs[label] = func(label) + funcs[label].add_region(fl, line, raw[cline + 1].rstrip()) + cline += 2 + if raw[cline].rstrip() == "" and in_regions: + cline += 4 + in_regions = False + + for key in funcs.keys(): + funcs[key].compute_total_cost() + + for key in sorted(funcs, key=lambda name: funcs[name].total_time, reverse=True): + funcs[key].print_region() + print() + if __name__ == "__main__": - main(sys.argv[1]) \ No newline at end of file + main(sys.argv[1]) From 243dd3228309806a33063713f2f2c43203d89826 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 14:20:35 -0600 Subject: [PATCH 11/16] just use PARTHENON_INSTRUMENT where possible --- src/driver/driver.cpp | 2 +- src/interface/update.hpp | 6 +++--- src/mesh/amr_loadbalance.cpp | 8 ++++---- src/outputs/parthenon_hdf5.cpp | 30 +++++++++++++++--------------- src/utils/instrument.hpp | 2 -- 5 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 166586003bc4..b12a8dde3f9b 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -76,7 +76,7 @@ DriverStatus EvolutionDriver::Execute() { DumpInputParameters(); { // Main t < tmax loop region - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT while (tm.KeepGoing()) { if (Globals::my_rank == 0) OutputCycleDiagnostics(); diff --git a/src/interface/update.hpp b/src/interface/update.hpp index dd4b8158d0b8..21f035caefa3 100644 --- a/src/interface/update.hpp +++ b/src/interface/update.hpp @@ -292,19 +292,19 @@ TaskStatus FillDerived(T *rc) { PARTHENON_INSTRUMENT auto pm = rc->GetParentPointer(); { // PreFillDerived region - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->PreFillDerived(rc); } } // PreFillDerived region { // FillDerived region - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->FillDerived(rc); } } // FillDerived region { // PostFillDerived region - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->PostFillDerived(rc); } diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp index 0814c5ccb487..1e76d17521b3 100644 --- a/src/mesh/amr_loadbalance.cpp +++ b/src/mesh/amr_loadbalance.cpp @@ -682,7 +682,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput int onbe = onbs + nblist[Globals::my_rank] - 1; { // Construct new list region - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal); // create a list mapping the previous gid to the current one @@ -759,7 +759,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput // Send data from old to new blocks std::vector send_reqs; { // AMR Send region - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT for (int n = onbs; n <= onbe; n++) { int nn = oldtonew[n]; LogicalLocation &oloc = loclist[n]; @@ -791,7 +791,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput // Construct a new MeshBlock list (moving the data within the MPI rank) BlockList_t new_block_list(nbe - nbs + 1); { // AMR Construct new MeshBlockList region - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT RegionSize block_size = GetBlockSize(); for (int n = nbs; n <= nbe; n++) { @@ -831,7 +831,7 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput // Receive the data and load into MeshBlocks { // AMR Recv and unpack data - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT bool all_received; int niter = 0; if (block_list.size() > 0) { diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp index 3e9e5d55c2e7..e15a038fae86 100644 --- a/src/outputs/parthenon_hdf5.cpp +++ b/src/outputs/parthenon_hdf5.cpp @@ -113,9 +113,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // WRITING ATTRIBUTES // // -------------------------------------------------------------------------------- // H5G info_group; - { // write attributes - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) { // Input section - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + { // write attributes + PARTHENON_INSTRUMENT { // Input section + PARTHENON_INSTRUMENT // write input key-value pairs std::ostringstream oss; pin->ParameterDump(oss); @@ -129,7 +129,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // we'll need this again at the end info_group = MakeGroup(file, "/Info"); { // write info - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT HDF5WriteAttribute("OutputFormatVersion", OUTPUT_VERSION_FORMAT, info_group); if (tm != nullptr) { @@ -189,7 +189,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm } // write info { // write params - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT const H5G params_group = MakeGroup(file, "/Params"); for (const auto &package : pm->packages.AllPackages()) { @@ -234,7 +234,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm #endif { // write Blocks metadata - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT const H5G gBlocks = MakeGroup(file, "/Blocks"); // write Xmin[ndim] for blocks @@ -268,7 +268,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // Write mesh coordinates to file { // write mesh coords - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT for (const bool face : {true, false}) { const H5G gLocations = MakeGroup(file, face ? "/Locations" : "/VolumeLocations"); @@ -296,7 +296,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // Write Levels and Logical Locations with the level for each Meshblock loclist contains // levels and logical locations for all meshblocks on all ranks { // write levels + log locs - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT const auto &loclist = pm->GetLocList(); std::vector levels; @@ -333,7 +333,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm hsize_t num_sparse; std::unique_ptr sparse_allocated; { // write all variable data - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT // All blocks have the same list of variable metadata that exist in the entire // simulation, but not all variables may be allocated on all blocks @@ -408,7 +408,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // for each variable we write for (auto &vinfo : all_vars_info) { - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT // not really necessary, but doesn't hurt memset(tmpData.data(), 0, tmpData.size() * sizeof(OutT)); @@ -478,7 +478,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm hsize_t index = 0; { // fill host output buffer - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT // for each local mesh block for (size_t b_idx = 0; b_idx < num_blocks_local; ++b_idx) { const auto &pmb = pm->block_list[b_idx]; @@ -551,7 +551,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm } // fill host output buffer { // write variable data - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT // write data to file HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt, p_glob_cnt, pl_xfer, pl_dcreate); @@ -590,7 +590,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write // this if we have sparse fields) if (num_sparse > 0) { - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT local_count[1] = global_count[1] = num_sparse; HDF5Write2D(file, "SparseInfo", sparse_allocated.get(), p_loc_offset, p_loc_cnt, @@ -611,7 +611,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm AllSwarmInfo swarm_info(pm->block_list, output_params.swarms, restart_); { // write particle data - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT for (auto &[swname, swinfo] : swarm_info.all_info) { const H5G g_swm = MakeGroup(file, swname); // offsets/counts are NOT the same here vs the grid data @@ -674,7 +674,7 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm } // write particle data { // generate xdmf - PARTHENON_INSTRUMENT_REGION(PARTHENON_AUTO_LABEL) + PARTHENON_INSTRUMENT // generate XDMF companion file XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info); } // generate xdmf diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp index df0a072e13d6..9f5e9aaecc77 100644 --- a/src/utils/instrument.hpp +++ b/src/utils/instrument.hpp @@ -24,8 +24,6 @@ #define PARTHENON_INSTRUMENT_REGION(name) \ KokkosTimer __UNIQUE_INST_VAR(internal_inst_reg, __LINE__)(name); #define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__) -//#undef UNIQUE_VAR -//#undef UNIQUE_VAR2 namespace parthenon { namespace detail { From 57ffd1db945c5d61c00af149c41f240c381d9a99 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 14:25:58 -0600 Subject: [PATCH 12/16] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 181a58b2698c..64e6fa9bf5c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Current develop ### Added (new features/APIs/variables/...) +- [[PR 969]](https://github.com/parthenon-hpc-lab/parthenon/pull/969) New macro-based auto-naming of profiling regions and kernels - [[PR 907]](https://github.com/parthenon-hpc-lab/parthenon/pull/907) PEP1: Allow subclassing StateDescriptor - [[PR 932]](https://github.com/parthenon-hpc-lab/parthenon/pull/932) Add GetOrAddFlag to metadata - [[PR 931]](https://github.com/parthenon-hpc-lab/parthenon/pull/931) Allow SparsePacks with subsets of blocks From e1c960bd2718b50dfd1f5e96fdc987fc54c97ad0 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 1 Nov 2023 14:47:14 -0600 Subject: [PATCH 13/16] oops, tried to instrument a host-device function --- src/prolong_restrict/pr_loops.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/prolong_restrict/pr_loops.hpp b/src/prolong_restrict/pr_loops.hpp index 2cb07ac4c547..288b18ed60a6 100644 --- a/src/prolong_restrict/pr_loops.hpp +++ b/src/prolong_restrict/pr_loops.hpp @@ -64,7 +64,6 @@ KOKKOS_INLINE_FUNCTION void InnerProlongationRestrictionLoop( team_mbr_t &team_member, std::size_t buf, const ProResInfoArr_t &info, const IndexRange &ckb, const IndexRange &cjb, const IndexRange &cib, const IndexRange &kb, const IndexRange &jb, const IndexRange &ib) { - PARTHENON_INSTRUMENT const auto &idxer = info(buf).idxer[static_cast(CEL)]; par_for_inner( inner_loop_pattern_tvr_tag, team_member, 0, idxer.size() - 1, [&](const int ii) { From 78e612c60672d4d39458a9e290613a8ecf1a08b9 Mon Sep 17 00:00:00 2001 From: jdolence Date: Tue, 9 Jan 2024 17:59:10 -0700 Subject: [PATCH 14/16] formatting --- src/mesh/amr_loadbalance.cpp | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp index edf20820517d..5c3f847f1e36 100644 --- a/src/mesh/amr_loadbalance.cpp +++ b/src/mesh/amr_loadbalance.cpp @@ -915,14 +915,14 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput for (auto &var : pmb->vars_cc_) { prolongation_cache.RegisterRegionHost( iprolong++, - ProResInfo::GetInteriorProlongate(pmb.get(), NeighborBlock(), var), var.get(), - resolved_packages.get()); + ProResInfo::GetInteriorProlongate(pmb.get(), NeighborBlock(), var), + var.get(), resolved_packages.get()); } } prolongation_cache.CopyToDevice(); } refinement::ProlongateShared(resolved_packages.get(), prolongation_cache, - block_list[0]->cellbounds, block_list[0]->c_cellbounds); + block_list[0]->cellbounds, block_list[0]->c_cellbounds); // update the lists loclist = std::move(newloc); @@ -943,11 +943,11 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput newly_refined); } // Make sure all old sends/receives are done before we reconfigure the mesh - #ifdef MPI_PARALLEL +#ifdef MPI_PARALLEL if (send_reqs.size() != 0) PARTHENON_MPI_CHECK( MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE)); - #endif +#endif // Re-initialize the mesh with our temporary ownership/neighbor configurations. // No buffers are different when we switch to the final precedence order. SetSameLevelNeighbors(block_list, leaf_grid_locs, this->GetRootGridInfo(), nbs, false, @@ -955,14 +955,16 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput BuildGMGHierarchy(nbs, pin, app_in); Initialize(false, pin, app_in); - // Internal refinement relies on the fine shared values, which are only consistent after - // being updated with any previously fine versions + // Internal refinement relies on the fine shared values, which are only consistent + // after being updated with any previously fine versions refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache, - block_list[0]->cellbounds, block_list[0]->c_cellbounds); + block_list[0]->cellbounds, + block_list[0]->c_cellbounds); - // Rebuild just the ownership model, this time weighting the "new" fine blocks just like - // any other blocks at their level. - SetSameLevelNeighbors(block_list, leaf_grid_locs, this->GetRootGridInfo(), nbs, false); + // Rebuild just the ownership model, this time weighting the "new" fine blocks just + // like any other blocks at their level. + SetSameLevelNeighbors(block_list, leaf_grid_locs, this->GetRootGridInfo(), nbs, + false); for (auto &pmb : block_list) { pmb->pbval->SearchAndSetNeighbors(this, tree, ranklist.data(), nslist.data()); } From 20a5007d95055bbcd51bb67a59eec1f67ecf2071 Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 10 Jan 2024 09:59:50 -0700 Subject: [PATCH 15/16] address review comments --- doc/sphinx/src/instrumentation.rst | 29 +++++++++++++++++++++++++++ src/driver/driver.cpp | 32 ++++++++++-------------------- src/utils/instrument.hpp | 21 +++++++------------- 3 files changed, 46 insertions(+), 36 deletions(-) create mode 100644 doc/sphinx/src/instrumentation.rst diff --git a/doc/sphinx/src/instrumentation.rst b/doc/sphinx/src/instrumentation.rst new file mode 100644 index 000000000000..21cd08f56613 --- /dev/null +++ b/doc/sphinx/src/instrumentation.rst @@ -0,0 +1,29 @@ +.. _instrumentation: + +Performance Instrumentation +=========================== + +Parthenon provides several macros that make instrumenting your code simple. For now, +these macros instantiate Kokkos profiling regions via calls to +``Kokkos::Profiling::pushRegion`` and ``Kokkos::Profiling::popRegion``, meaning all the +Kokkos profiling tools should work straightforwardly with Parthenon-based applications. + +- ``PARTHENON_INSTRUMENT``: Instantiates an object that pushes a profiling region on +construction and pops the region on destruction. The name of the region is +auto-generated and takes the form ``"file_name::line_number::function_name"``. The region +being profiled is controlled by invoking the macro at the appropriate scope. +- ``PARTHENON_INSTRUMENT_REGION(name)``: Same as ``PARTHENON_INSTRUMENT``, but uses the +provided name instead of the auto-generated name. +- ``PARTHENON_INSTRUMENT_REGION_PUSH``: A trivial wrapper around ``pushRegion`` where +the name is auto-generated as above. +- ``PARTHENON_INSTRUMENT_REGION_POP``: A trivial wrapper around ``popRegion``. + +In addition to these macros, Parthenon provides the ``PARTHENON_AUTO_LABEL`` macro which +can be used to provide a label to kernels (e.g. through the various ``par_for`` +functions). The auto-generated name is the same as was described above. + +Though not required, the use of the auto-generated names is highly recommended. In +addition to avoiding possible name collisions, the auto-generated names provide a simple +structure that is amenable to post-processing profiling results to ease analysis. For +example, the ``process_timer.py`` script that ships with Parthenon post-processes the +results of the Kokkos simple kernel timer output to provide a convenient view of the data. \ No newline at end of file diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 6f96966b8bf4..f384a5503eda 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -67,40 +67,28 @@ DriverStatus EvolutionDriver::Execute() { SetGlobalTimeStep(); // Before loop do work - // App input version - Kokkos::Profiling::pushRegion("Driver_UserWorkBeforeLoop"); - if (app_input->UserWorkBeforeLoop != nullptr) { - app_input->UserWorkBeforeLoop(pmesh, pinput, tm); - } - // packages version - for (auto &[name, pkg] : pmesh->packages.AllPackages()) { - pkg->UserWorkBeforeLoop(pmesh, pinput, tm); - } - Kokkos::Profiling::popRegion(); // Driver_UserWorkBeforeLoop - - OutputSignal signal = OutputSignal::none; - pouts->MakeOutputs(pmesh, pinput, &tm, signal); - pmesh->mbcnt = 0; - int perf_cycle_offset = - pinput->GetOrAddInteger("parthenon/time", "perf_cycle_offset", 0); - - // Output a text file of all parameters at this point - // Defaults must be set across all ranks - DumpInputParameters(); - { // UserWorkBeforeLoop PARTHENON_INSTRUMENT // App input version if (app_input->UserWorkBeforeLoop != nullptr) { app_input->UserWorkBeforeLoop(pmesh, pinput, tm); } - // packages version for (auto &[name, pkg] : pmesh->packages.AllPackages()) { pkg->UserWorkBeforeLoop(pmesh, pinput, tm); } } // UserWorkBeforeLoop + OutputSignal signal = OutputSignal::none; + pouts->MakeOutputs(pmesh, pinput, &tm, signal); + pmesh->mbcnt = 0; + int perf_cycle_offset = + pinput->GetOrAddInteger("parthenon/time", "perf_cycle_offset", 0); + + // Output a text file of all parameters at this point + // Defaults must be set across all ranks + DumpInputParameters(); + { // Main t < tmax loop region PARTHENON_INSTRUMENT while (tm.KeepGoing()) { diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp index 9f5e9aaecc77..17764657f5ca 100644 --- a/src/utils/instrument.hpp +++ b/src/utils/instrument.hpp @@ -23,25 +23,18 @@ KokkosTimer __UNIQUE_INST_VAR(internal_inst, __LINE__)(__FILE__, __LINE__, __func__); #define PARTHENON_INSTRUMENT_REGION(name) \ KokkosTimer __UNIQUE_INST_VAR(internal_inst_reg, __LINE__)(name); +#define PARTHENON_INSTRUMENT_REGION_PUSH \ + Kokkos::Profiling::pushRegion(build_auto_label(__FILE__, __LINE__, __func__)); +#define PARTHENON_INSTRUMENT_REGION_POP Kokkos::Profiling::popRegion(); #define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__) namespace parthenon { -namespace detail { -inline std::string strip_full_path(const std::string &full_path) { - std::string label; - auto npos = full_path.rfind('/'); - if (npos == std::string::npos) { - label = full_path; - } else { - label = full_path.substr(npos + 1); - } - return label; -} -} // namespace detail -inline std::string build_auto_label(const std::string &file, const int line, +inline std::string build_auto_label(const std::string &fullpath, const int line, const std::string &name) { - return detail::strip_full_path(file) + "::" + std::to_string(line) + "::" + name; + size_t pos = fullpath.find_last_of("/\\"); + std::string file = (pos != std::string::npos ? fullpath.substr(pos + 1) : fullpath); + return file + "::" + std::to_string(line) + "::" + name; } struct KokkosTimer { From decffd504ae4c0c00999a9d084e5325f521ae7da Mon Sep 17 00:00:00 2001 From: jdolence Date: Wed, 10 Jan 2024 10:34:12 -0700 Subject: [PATCH 16/16] fix formatting of doc --- doc/sphinx/src/instrumentation.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/sphinx/src/instrumentation.rst b/doc/sphinx/src/instrumentation.rst index 21cd08f56613..14f8c643cf74 100644 --- a/doc/sphinx/src/instrumentation.rst +++ b/doc/sphinx/src/instrumentation.rst @@ -9,13 +9,13 @@ these macros instantiate Kokkos profiling regions via calls to Kokkos profiling tools should work straightforwardly with Parthenon-based applications. - ``PARTHENON_INSTRUMENT``: Instantiates an object that pushes a profiling region on -construction and pops the region on destruction. The name of the region is -auto-generated and takes the form ``"file_name::line_number::function_name"``. The region -being profiled is controlled by invoking the macro at the appropriate scope. + construction and pops the region on destruction. The name of the region is + auto-generated and takes the form ``"file_name::line_number::function_name"``. The region + being profiled is controlled by invoking the macro at the appropriate scope. - ``PARTHENON_INSTRUMENT_REGION(name)``: Same as ``PARTHENON_INSTRUMENT``, but uses the -provided name instead of the auto-generated name. + provided name instead of the auto-generated name. - ``PARTHENON_INSTRUMENT_REGION_PUSH``: A trivial wrapper around ``pushRegion`` where -the name is auto-generated as above. + the name is auto-generated as above. - ``PARTHENON_INSTRUMENT_REGION_POP``: A trivial wrapper around ``popRegion``. In addition to these macros, Parthenon provides the ``PARTHENON_AUTO_LABEL`` macro which