parthenon-hpc-lab · pgrete · Jan 12, 2024 · Oct 31, 2023 · Oct 31, 2023 · Oct 31, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,7 @@
 ## Current develop
 
 ### Added (new features/APIs/variables/...)
+- [[PR 969]](https://github.com/parthenon-hpc-lab/parthenon/pull/969) New macro-based auto-naming of profiling regions and kernels
 - [[PR 981]](https://github.com/parthenon-hpc-lab/parthenon/pull/981) Add IndexSplit
 - [[PR 983]](https://github.com/parthenon-hpc-lab/parthenon/pull/983) Add Contains to SparsePack
 - [[PR 968]](https://github.com/parthenon-hpc-lab/parthenon/pull/968) Add per package registration of boundary conditions

diff --git a/benchmarks/burgers/burgers_package.cpp b/benchmarks/burgers/burgers_package.cpp
@@ -152,7 +152,7 @@ void CalculateDerived(MeshData<Real> *md) {
   size_t scratch_size = 0;
   constexpr int scratch_level = 0;
   parthenon::par_for_outer(
-      DEFAULT_OUTER_LOOP_PATTERN, "CalculateDerived", DevExecSpace(), scratch_size,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size,
       scratch_level, 0, nblocks - 1, kb.s, kb.e, jb.s, jb.e,
       KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) {
         Real *out = &v(b, 0, k, j, 0);
@@ -169,7 +169,7 @@ void CalculateDerived(MeshData<Real> *md) {
 
 // provide the routine that estimates a stable timestep for this package
 Real EstimateTimestepMesh(MeshData<Real> *md) {
-  Kokkos::Profiling::pushRegion("Task_burgers_EstimateTimestepMesh");
+  PARTHENON_INSTRUMENT
   Mesh *pm = md->GetMeshPointer();
   IndexRange ib = md->GetBoundsI(IndexDomain::interior);
   IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
@@ -197,14 +197,13 @@ Real EstimateTimestepMesh(MeshData<Real> *md) {
       },
       Kokkos::Min<Real>(min_dt));
 
-  Kokkos::Profiling::popRegion(); // Task_burgers_EstimateTimestepMesh
   return cfl * min_dt;
 }
 
 TaskStatus CalculateFluxes(MeshData<Real> *md) {
   using parthenon::ScratchPad1D;
   using parthenon::team_mbr_t;
-  Kokkos::Profiling::pushRegion("Task_burgers_CalculateFluxes");
+  PARTHENON_INSTRUMENT
 
   auto pm = md->GetParentPointer();
   const int ndim = pm->ndim;
@@ -236,7 +235,7 @@ TaskStatus CalculateFluxes(MeshData<Real> *md) {
   size_t scratch_size = 0;
   constexpr int scratch_level = 0;
   parthenon::par_for_outer(
-      DEFAULT_OUTER_LOOP_PATTERN, "burgers::reconstruction", DevExecSpace(), scratch_size,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size,
       scratch_level, 0, nblocks - 1, kb.s - dk, kb.e + dk, jb.s - dj, jb.e + dj,
       KOKKOS_LAMBDA(team_mbr_t member, const int b, const int k, const int j) {
         bool xrec = (k >= kb.s && k <= kb.e) && (j >= jb.s && j <= jb.e);
@@ -307,7 +306,7 @@ TaskStatus CalculateFluxes(MeshData<Real> *md) {
   // now we'll solve the Riemann problems to get fluxes
   scratch_size = 2 * ScratchPad1D<Real>::shmem_size(ib.e + 1);
   parthenon::par_for_outer(
-      DEFAULT_OUTER_LOOP_PATTERN, "burgers::reconstruction", DevExecSpace(), scratch_size,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size,
       scratch_level, 0, nblocks - 1, kb.s, kb.e + dk, jb.s, jb.e + dj,
       KOKKOS_LAMBDA(team_mbr_t member, const int b, const int k, const int j) {
         bool xflux = (k <= kb.e && j <= jb.e);
@@ -402,7 +401,6 @@ TaskStatus CalculateFluxes(MeshData<Real> *md) {
         }
       });
 
-  Kokkos::Profiling::popRegion(); // Task_burgers_CalculateFluxes
   return TaskStatus::complete;
 }
 

diff --git a/benchmarks/burgers/parthenon_app_inputs.cpp b/benchmarks/burgers/parthenon_app_inputs.cpp
@@ -53,7 +53,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   const auto num_vars = q.GetDim(4);
 
   pmb->par_for(
-      "Burgers::ProblemGenerator", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         const Real x = coords.Xc<1>(i);
         const Real y = coords.Xc<2>(j);

diff --git a/doc/sphinx/src/instrumentation.rst b/doc/sphinx/src/instrumentation.rst
@@ -0,0 +1,29 @@
+.. _instrumentation:
+
+Performance Instrumentation
+===========================
+
+Parthenon provides several macros that make instrumenting your code simple.  For now,
+these macros instantiate Kokkos profiling regions via calls to
+``Kokkos::Profiling::pushRegion`` and ``Kokkos::Profiling::popRegion``, meaning all the
+Kokkos profiling tools should work straightforwardly with Parthenon-based applications.
+
+- ``PARTHENON_INSTRUMENT``: Instantiates an object that pushes a profiling region on
+  construction and pops the region on destruction.  The name of the region is
+  auto-generated and takes the form ``"file_name::line_number::function_name"``.  The
+  extent of the profiled region is determined by the scope in which the macro is invoked.
+- ``PARTHENON_INSTRUMENT_REGION(name)``: Same as ``PARTHENON_INSTRUMENT``, but uses the
+  provided name instead of the auto-generated name.
+- ``PARTHENON_INSTRUMENT_REGION_PUSH``: A trivial wrapper around ``pushRegion`` where
+  the name is auto-generated as above.
+- ``PARTHENON_INSTRUMENT_REGION_POP``: A trivial wrapper around ``popRegion``.
+
+In addition to these macros, Parthenon provides the ``PARTHENON_AUTO_LABEL`` macro, which
+can be used to provide a label to kernels (e.g., through the various ``par_for``
+functions).  The auto-generated label has the same form as the region names above.
+
+Though not required, the use of the auto-generated names is highly recommended.  In
+addition to avoiding possible name collisions, the auto-generated names provide a simple
+structure that is amenable to post-processing profiling results to ease analysis.  For
+example, the ``process_timer.py`` script that ships with Parthenon post-processes the
+output of the Kokkos simple kernel timer to provide a convenient view of the data.
diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp
@@ -248,8 +248,7 @@ AmrTag CheckRefinement(MeshBlockData<Real> *rc) {
 
   typename Kokkos::MinMax<Real>::value_type minmax;
   pmb->par_reduce(
-      "advection check refinement", 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-      ib.e,
+      PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i,
                     typename Kokkos::MinMax<Real>::value_type &lminmax) {
         lminmax.min_val =
@@ -287,7 +286,7 @@ void PreFill(MeshBlockData<Real> *rc) {
     const int out = imap.get("one_minus_advected").first;
     const auto num_vars = rc->Get("advected").data.GetDim(4);
     pmb->par_for(
-        "advection_package::PreFill", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           v(out + n, k, j, i) = 1.0 - v(in + n, k, j, i);
         });
@@ -311,7 +310,7 @@ void SquareIt(MeshBlockData<Real> *rc) {
   const int out = imap.get("one_minus_advected_sq").first;
   const auto num_vars = rc->Get("advected").data.GetDim(4);
   pmb->par_for(
-      "advection_package::SquareIt", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         v(out + n, k, j, i) = v(in + n, k, j, i) * v(in + n, k, j, i);
       });
@@ -328,8 +327,8 @@ void SquareIt(MeshBlockData<Real> *rc) {
   if (profile == "smooth_gaussian") {
     const auto &advected = rc->Get("advected").data;
     pmb->par_for(
-        "advection_package::SquareIt bval check", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e,
-        ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
+        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           PARTHENON_REQUIRE(advected(n, k, j, i) != 0.0,
                             "Advected not properly initialized.");
         });
@@ -364,8 +363,8 @@ void PostFill(MeshBlockData<Real> *rc) {
     const int out37 = imap.get("one_minus_sqrt_one_minus_advected_sq_37").first;
     const auto num_vars = rc->Get("advected").data.GetDim(4);
     pmb->par_for(
-        "advection_package::PostFill", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
-        ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
+        PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+        KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
           v(out12 + n, k, j, i) = 1.0 - sqrt(v(in + n, k, j, i));
           v(out37 + n, k, j, i) = 1.0 - v(out12 + n, k, j, i);
         });
@@ -398,7 +397,8 @@ Real AdvectionHst(MeshData<Real> *md) {
   const bool volume_weighting = std::is_same<T, Kokkos::Sum<Real, HostExecSpace>>::value;
 
   pmb->par_reduce(
-      "AdvectionHst", 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s,
+      ib.e,
       KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lresult) {
         const auto &coords = advected_pack.GetCoords(b);
         // `join` is a function of the Kokkos::ReducerConecpt that allows to use the same
@@ -429,7 +429,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
   // this is obviously overkill for this constant velocity problem
   Real min_dt;
   pmb->par_reduce(
-      "advection_package::EstimateTimestep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) {
         if (vx != 0.0)
           lmin_dt = std::min(lmin_dt, coords.Dxc<X1DIR>(k, j, i) / std::abs(vx));
@@ -449,7 +449,7 @@ Real EstimateTimestepBlock(MeshBlockData<Real> *rc) {
 TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   using parthenon::MetadataFlag;
 
-  Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes");
+  PARTHENON_INSTRUMENT
   auto pmb = rc->GetBlockPointer();
 
   IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior);
@@ -476,8 +476,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   size_t scratch_size_in_bytes = parthenon::ScratchPad2D<Real>::shmem_size(nvar, nx1);
   // get x-fluxes
   pmb->par_for_outer(
-      "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e,
-      KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+      PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+      jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
         parthenon::ScratchPad2D<Real> ql(member.team_scratch(scratch_level), nvar, nx1);
         parthenon::ScratchPad2D<Real> qr(member.team_scratch(scratch_level), nvar, nx1);
         // get reconstructed state on faces
@@ -509,8 +509,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   // get y-fluxes
   if (pmb->pmy_mesh->ndim >= 2) {
     pmb->par_for_outer(
-        "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1,
-        KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
+        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s,
+        jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
           // reconstructed arrays for different `j` with `j` being part of the outer
@@ -552,7 +552,8 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
   // get z-fluxes
   if (pmb->pmy_mesh->ndim == 3) {
     pmb->par_for_outer(
-        "x3 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e,
+        PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1,
+        jb.s, jb.e,
         KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) {
           // the overall algorithm/use of scratch pad here is clear inefficient and kept
           // just for demonstrating purposes. The key point is that we cannot reuse
@@ -592,7 +593,6 @@ TaskStatus CalculateFluxes(std::shared_ptr<MeshBlockData<Real>> &rc) {
         });
   }
 
-  Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes
   return TaskStatus::complete;
 }
 

diff --git a/example/advection/parthenon_app_inputs.cpp b/example/advection/parthenon_app_inputs.cpp
@@ -73,7 +73,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   if (profile == "block") profile_type = 3;
 
   pmb->par_for(
-      "Advection::ProblemGenerator", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         if (profile_type == 0) {
           Real x = cos_a2 * (coords.Xc<1>(i) * cos_a3 + coords.Xc<2>(j) * sin_a3) +
@@ -99,8 +99,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   // initialize some arbitrary cells in the first block that move in all 6 directions
   if (profile_type == 3 && block_id == 0) {
     pmb->par_for(
-        "Advection::ProblemGenerator bvals test", 0, 1,
-        KOKKOS_LAMBDA(const int /*unused*/) {
+        PARTHENON_AUTO_LABEL, 0, 1, KOKKOS_LAMBDA(const int /*unused*/) {
           q(idx_adv, 4, 4, 4) = 10.0;
           q(idx_v, 4, 4, 4) = vx;
           q(idx_adv, 4, 6, 4) = 10.0;

diff --git a/example/calculate_pi/calculate_pi.cpp b/example/calculate_pi/calculate_pi.cpp
@@ -88,7 +88,7 @@ void SetInOrOut(MeshBlockData<Real> *rc) {
   // Loop bounds are set to catch the case where the edge is between the
   // cell centers of the first/last real cell and the first ghost cell
   pmb->par_for(
-      "SetInOrOut", kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         Real rsq = std::pow(coords.Xc<1>(i), 2) + std::pow(coords.Xc<2>(j), 2);
         if (rsq < radius * radius) {

diff --git a/example/kokkos_pi/kokkos_pi.cpp b/example/kokkos_pi/kokkos_pi.cpp
@@ -272,7 +272,7 @@ result_t naiveParFor(int n_block, int n_mesh, int n_iter, double radius) {
       auto inOrOut = base->PackVariables({Metadata::Independent});
       // iops = 0  fops = 11
       par_for(
-          DEFAULT_LOOP_PATTERN, "par_for in or out", DevExecSpace(), 0,
+          DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0,
           inOrOut.GetDim(4) - 1, nghost, inOrOut.GetDim(3) - nghost - 1, nghost,
           inOrOut.GetDim(2) - nghost - 1, nghost, inOrOut.GetDim(1) - nghost - 1,
           KOKKOS_LAMBDA(const int l, const int k_grid, const int j_grid,

diff --git a/example/particle_leapfrog/particle_leapfrog.cpp b/example/particle_leapfrog/particle_leapfrog.cpp
@@ -189,7 +189,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   // This hardcoded implementation should only used in PGEN and not during runtime
   // addition of particles as indices need to be taken into account.
   pmb->par_for(
-      "CreateParticles", 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) {
         const auto &m = ids_this_block(n);
 
         id(n) = m; // global unique id
@@ -227,7 +227,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator
   const Real ay = 0.0;
   const Real az = 0.0;
   pmb->par_for(
-      "Leapfrog", 0, max_active_index, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
         if (swarm_d.IsActive(n)) {
           // drift
           x(n) += v(0, n) * 0.5 * dt;

diff --git a/example/particle_tracers/particle_tracers.cpp b/example/particle_tracers/particle_tracers.cpp
@@ -182,7 +182,7 @@ TaskStatus AdvectTracers(MeshBlock *pmb, const StagedIntegrator *integrator) {
 
   auto swarm_d = swarm->GetDeviceContext();
   pmb->par_for(
-      "Tracer advection", 0, max_active_index, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) {
         if (swarm_d.IsActive(n)) {
           x(n) += vx * dt;
           y(n) += vy * dt;
@@ -219,13 +219,13 @@ TaskStatus DepositTracers(MeshBlock *pmb) {
   auto &tracer_dep = pmb->meshblock_data.Get()->Get("tracer_deposition").data;
   // Reset particle count
   pmb->par_for(
-      "ZeroParticleDep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) { tracer_dep(k, j, i) = 0.; });
 
   const int ndim = pmb->pmy_mesh->ndim;
 
   pmb->par_for(
-      "DepositTracers", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) {
         if (swarm_d.IsActive(n)) {
           int i = static_cast<int>(std::floor((x(n) - minx_i) / dx_i) + ib.s);
           int j = 0;
@@ -269,7 +269,7 @@ TaskStatus CalculateFluxes(MeshBlockData<Real> *mbd) {
 
   // Spatially first order upwind method
   pmb->par_for(
-      "CalculateFluxesX1", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         // X1
         if (vx > 0.) {
@@ -282,7 +282,7 @@ TaskStatus CalculateFluxes(MeshBlockData<Real> *mbd) {
   if (ndim > 1) {
     auto x2flux = mbd->Get("advected").flux[X2DIR].Get<4>();
     pmb->par_for(
-        "CalculateFluxesX2", kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e,
         KOKKOS_LAMBDA(const int k, const int j, const int i) {
           // X2
           if (vy > 0.) {
@@ -296,7 +296,7 @@ TaskStatus CalculateFluxes(MeshBlockData<Real> *mbd) {
   if (ndim > 2) {
     auto x3flux = mbd->Get("advected").flux[X3DIR].Get<4>();
     pmb->par_for(
-        "CalculateFluxesX3", kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e,
+        PARTHENON_AUTO_LABEL, kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e,
         KOKKOS_LAMBDA(const int k, const int j, const int i) {
           // X3
           if (vz > 0.) {
@@ -355,7 +355,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   const Real kwave = 2. * M_PI / (x_max_mesh - x_min_mesh);
 
   pmb->par_for(
-      "Init advected profile", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         advected(k, j, i) = advected_mean + advected_amp * sin(kwave * coords.Xc<1>(i));
       });
@@ -387,7 +387,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   // This hardcoded implementation should only used in PGEN and not during runtime
   // addition of particles as indices need to be taken into account.
   pmb->par_for(
-      "CreateParticles", 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) {
+      PARTHENON_AUTO_LABEL, 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) {
         auto rng_gen = rng_pool.get_state();
 
         // Rejection sample the x position