From df1947ac73b33f60915b4a19113c80b55ecf61dc Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 30 Oct 2023 16:03:03 -0600 Subject: [PATCH 01/39] Add option for zone local block diagonal smoother --- src/solvers/bicgstab_solver.hpp | 3 +- src/solvers/mg_solver.hpp | 93 ++++++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 27 deletions(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 00e071b1763f..f661747ea314 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -32,6 +32,7 @@ namespace parthenon { namespace solvers { struct BiCGSTABParams { + MGParams mg_params; int max_iters = 10; Real residual_tolerance = 1.e-12; Real restart_threshold = -1.0; @@ -52,7 +53,7 @@ class BiCGSTABSolver { BiCGSTABSolver(StateDescriptor *pkg, BiCGSTABParams params_in, equations eq_in = equations(), std::vector shape = {}) - : preconditioner(pkg, MGParams(), eq_in, shape), params_(params_in), + : preconditioner(pkg, params_in.mg_params, eq_in, shape), params_(params_in), iter_counter(0), eqs_(eq_in) { using namespace refinement_ops; auto mu = Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index f49e0e1ac5f4..b161adad1411 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -35,6 +35,7 @@ struct MGParams { Real residual_tolerance = 1.e-12; bool do_FAS = true; std::string smoother = "SRJ2"; + bool two_by_two_diagonal = false; }; template @@ -64,7 +65,12 @@ class MGSolver { auto mu0 = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, shape); pkg->AddField(u0::name(), mu0); - pkg->AddField(D::name(), mu0); + auto Dshape = shape; + if (params_.two_by_two_diagonal) { + Dshape = std::vector{4}; + } + auto mD = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, Dshape); + pkg->AddField(D::name(), mD); } TaskID AddTasks(TaskList & /*tl*/, IterativeTasks &itl, TaskID dependence, @@ -142,7 +148,7 @@ class MGSolver { enum class GSType { all, red, black }; template - static TaskStatus Jacobi(std::shared_ptr> &md, double weight, + TaskStatus Jacobi(std::shared_ptr> &md, double weight, GSType gs_type = GSType::all) { using namespace parthenon; const int ndim = md->GetMeshPointer()->ndim; @@ -158,29 +164,64 @@ class MGSolver { auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), include_block); - parthenon::par_for( - DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, pack.GetNBlocks() - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - const auto &coords = pack.GetCoordinates(b); - if ((i + j + k) % 2 == 1 && gs_type == GSType::red) return; - if ((i + j + k) % 2 == 0 && gs_type == GSType::black) return; - - const int nvars = - pack.GetUpperBound(b, D_t()) - pack.GetLowerBound(b, D_t()) + 1; - for (int c = 0; c < nvars; ++c) { - Real diag_elem = pack(b, te, D_t(c), k, j, i); - - // Get the off-diagonal contribution to Ax = (D + L + U)x = y - Real off_diag = pack(b, te, Axold_t(c), k, j, i) - - diag_elem * pack(b, te, xold_t(c), k, j, i); - - Real val = pack(b, te, rhs_t(c), k, j, i) - off_diag; - pack(b, te, xnew_t(c), k, j, i) = - weight * val / diag_elem + - (1.0 - weight) * pack(b, te, xold_t(c), k, j, i); - } - }); + if (params_.two_by_two_diagonal) { + parthenon::par_for( + DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, pack.GetNBlocks() - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, 
ib.e, + KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { + const auto &coords = pack.GetCoordinates(b); + if ((i + j + k) % 2 == 1 && gs_type == GSType::red) return; + if ((i + j + k) % 2 == 0 && gs_type == GSType::black) return; + + const int nvars = + pack.GetUpperBound(b, xnew_t()) - pack.GetLowerBound(b, xnew_t()) + 1; + + const Real D11 = pack(b, te, D_t(0), k, j, i); + const Real D22 = pack(b, te, D_t(1), k, j, i); + const Real D12 = pack(b, te, D_t(2), k, j, i); + const Real D21 = pack(b, te, D_t(3), k, j, i); + const Real det = D11 * D22 - D12 * D21; + + const Real Du0 = D11 * pack(b, te, xold_t(0), k, j, i) + + D12 * pack(b, te, xold_t(1), k, j, i); + const Real Du1 = D21 * pack(b, te, xold_t(0), k, j, i) + + D22 * pack(b, te, xold_t(1), k, j, i); + + const Real t0 = pack(b, te, rhs_t(0), k, j, i) - pack(b, te, Axold_t(0), k, j, i) + Du0; + const Real t1 = pack(b, te, rhs_t(1), k, j, i) - pack(b, te, Axold_t(1), k, j, i) + Du1; + + const Real v0 = (D22 * t0 - D12 * t1) / det; + const Real v1 = (-D21 * t0 + D11 * t1) / det; + + pack(b, te, xnew_t(0), k, j, i) = weight * v0 + (1.0 - weight) * pack(b, te, xold_t(0), k, j, i); + pack(b, te, xnew_t(1), k, j, i) = weight * v1 + (1.0 - weight) * pack(b, te, xold_t(1), k, j, i); + }); + } else { + parthenon::par_for( + DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, pack.GetNBlocks() - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { + const auto &coords = pack.GetCoordinates(b); + if ((i + j + k) % 2 == 1 && gs_type == GSType::red) return; + if ((i + j + k) % 2 == 0 && gs_type == GSType::black) return; + + const int nvars = + pack.GetUpperBound(b, xnew_t()) - pack.GetLowerBound(b, xnew_t()) + 1; + + for (int c = 0; c < nvars; ++c) { + Real diag_elem = pack(b, te, D_t(c), k, j, i); + + // Get the off-diagonal contribution to Ax = (D + L + U)x = y + Real off_diag = pack(b, te, Axold_t(c), k, j, i) - + diag_elem * pack(b, te, xold_t(c), k, j, i); + + Real val = pack(b, te, rhs_t(c), k, j, i) - off_diag; + pack(b, te, xnew_t(c), k, j, i) = + weight * val / diag_elem + + (1.0 - weight) * pack(b, te, xold_t(c), k, j, i); + } + }); + } return TaskStatus::complete; } @@ -191,7 +232,7 @@ class MGSolver { auto comm = AddBoundaryExchangeTasks(depends_on, tl, md, multilevel); auto mat_mult = eqs_.template Ax(tl, comm, md); - return tl.AddTask(mat_mult, Jacobi, md, omega, + return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, omega, GSType::all); } From 4e4a64a3e8ee4f42286afde8bbeeae697e3e1813 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 14:58:43 -0600 Subject: [PATCH 02/39] Switch to static pack descriptors for boundary conditions --- src/bvals/boundary_conditions_generic.hpp | 40 ++++++++++++++++------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/bvals/boundary_conditions_generic.hpp b/src/bvals/boundary_conditions_generic.hpp index b7300e6298d2..775d0ab321aa 100644 --- a/src/bvals/boundary_conditions_generic.hpp +++ b/src/bvals/boundary_conditions_generic.hpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include "basic_types.hpp" @@ -34,6 +36,31 @@ namespace BoundaryFunction { enum class BCSide { Inner, Outer }; enum class BCType { Outflow, Reflect, ConstantDeriv, Fixed, FixedFace }; +namespace impl { +using desc_key_t = std::tuple; +template +using map_bc_pack_descriptor_t = std::unordered_map::Descriptor, tuple_hash>; + +template +map_bc_pack_descriptor_t 
+GetPackDescriptorMap(std::shared_ptr> &rc) { + std::vector> elements + {{TopologicalType::Cell, Metadata::Cell}, + {TopologicalType::Face, Metadata::Face}, + {TopologicalType::Edge, Metadata::Edge}, + {TopologicalType::Node, Metadata::Node}}; + map_bc_pack_descriptor_t my_map; + for (auto [tt, md] : elements) { + std::vector flags{Metadata::FillGhost}; + flags.push_back(md); + std::set opts{PDOpt::Coarse}; + my_map.emplace(std::make_pair(desc_key_t{true, tt}, MakePackDescriptor(rc.get(), flags, opts))); + my_map.emplace(std::make_pair(desc_key_t{false, tt}, MakePackDescriptor(rc.get(), flags))); + } + return my_map; +} +} // namespace impl + template void GenericBC(std::shared_ptr> &rc, bool coarse, TopologicalElement el, Real val) { @@ -46,17 +73,8 @@ void GenericBC(std::shared_ptr> &rc, bool coarse, constexpr bool X3 = (DIR == X3DIR); constexpr bool INNER = (SIDE == BCSide::Inner); - std::vector flags{Metadata::FillGhost}; - if (GetTopologicalType(el) == TopologicalType::Cell) flags.push_back(Metadata::Cell); - if (GetTopologicalType(el) == TopologicalType::Face) flags.push_back(Metadata::Face); - if (GetTopologicalType(el) == TopologicalType::Edge) flags.push_back(Metadata::Edge); - if (GetTopologicalType(el) == TopologicalType::Node) flags.push_back(Metadata::Node); - - std::set opts; - if (coarse) opts = {PDOpt::Coarse}; - auto desc = MakePackDescriptor( - rc->GetBlockPointer()->pmy_mesh->resolved_packages.get(), flags, opts); - auto q = desc.GetPack(rc.get()); + static auto descriptors = impl::GetPackDescriptorMap(rc); + auto q = descriptors[impl::desc_key_t{coarse, GetTopologicalType(el)}].GetPack(rc.get()); const int b = 0; const int lstart = q.GetLowerBoundHost(b); const int lend = q.GetUpperBoundHost(b); From 7bccbf0df60a6c2565d992efc0df4c90568f5758 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 14:59:10 -0600 Subject: [PATCH 03/39] limit creation of std::vector --- src/interface/sparse_pack.hpp | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp index fc30cacceed2..dbde38279d0c 100644 --- a/src/interface/sparse_pack.hpp +++ b/src/interface/sparse_pack.hpp @@ -143,16 +143,15 @@ class SparsePack : public SparsePackBase { // accessed on device via instance of types in the type list Ts... 
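// A usage sketch of the two overloads introduced below (illustrative only:
// `desc` stands for a descriptor made with MakePackDescriptor and `md` for a
// MeshData pointer, neither of which is defined in this hunk):
//
//   std::vector<bool> include_block(md->NumBlocks(), true);
//   auto pack = desc.GetPack(md.get(), include_block); // caller-owned mask, no allocation
//   auto pack_all = desc.GetPack(md.get());            // overload that builds the mask itself
//
// Passing a mask of the wrong size now trips the PARTHENON_REQUIRE below
// instead of being silently replaced by a freshly allocated vector.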
// The pack will be created and accessible on the device template - SparsePack GetPack(T *pmd, std::vector include_block = {}, + SparsePack GetPack(T *pmd, std::vector &include_block, bool only_fine_two_level_composite_blocks = true) const { // If this is a composite grid MeshData object, only include blocks on // the finer level if constexpr (std::is_same>::value) { if (pmd->grid.type == GridType::two_level_composite && only_fine_two_level_composite_blocks) { - if (include_block.size() != pmd->NumBlocks()) { - include_block = std::vector(pmd->NumBlocks(), true); - } + PARTHENON_REQUIRE(include_block.size() == pmd->NumBlocks(), + "Passed wrong size block include list."); int fine_level = pmd->grid.logical_level; for (int b = 0; b < pmd->NumBlocks(); ++b) include_block[b] = @@ -162,6 +161,27 @@ class SparsePack : public SparsePackBase { } return SparsePack(SparsePackBase::GetPack(pmd, *this, include_block)); } + + template + SparsePack GetPack(T *pmd, bool only_fine_two_level_composite_blocks = true) const { + // If this is a composite grid MeshData object, only include blocks on + // the finer level + if constexpr (std::is_same>::value) { + if (pmd->grid.type == GridType::two_level_composite && + only_fine_two_level_composite_blocks) { + auto include_block = std::vector(pmd->NumBlocks(), true); + int fine_level = pmd->grid.logical_level; + for (int b = 0; b < pmd->NumBlocks(); ++b) + include_block[b] = + include_block[b] && + (fine_level == pmd->GetBlockData(b)->GetBlockPointer()->loc.level()); + return SparsePack(SparsePackBase::GetPack(pmd, *this, include_block)); + } else { + return SparsePack(SparsePackBase::GetPack(pmd, *this, std::vector{})); + } + } + return SparsePack(SparsePackBase::GetPack(pmd, *this, std::vector{})); + } SparsePackIdxMap GetMap() const { PARTHENON_REQUIRE(sizeof...(Ts) == 0, From 153c67744830db197848feac168508c910030833 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 15:00:43 -0600 Subject: [PATCH 04/39] Add option to get list of internal variable names --- src/solvers/bicgstab_solver.hpp | 1 + src/solvers/mg_solver.hpp | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index f661747ea314..f5157a738a29 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -50,6 +50,7 @@ class BiCGSTABSolver { INTERNALSOLVERVARIABLE(x, r); INTERNALSOLVERVARIABLE(x, p); INTERNALSOLVERVARIABLE(x, u); + } BiCGSTABSolver(StateDescriptor *pkg, BiCGSTABParams params_in, equations eq_in = equations(), std::vector shape = {}) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index b161adad1411..10129310822f 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -45,7 +45,9 @@ class MGSolver { INTERNALSOLVERVARIABLE(u, temp); // Temporary storage INTERNALSOLVERVARIABLE(u, u0); // Storage for initial solution during FAS INTERNALSOLVERVARIABLE(u, D); // Storage for (approximate) diagonal - + std::vector GetInternalVariableNames() const { + return {res_err::name(), temp::name(), u0::name(), D::name()}; + } MGSolver(StateDescriptor *pkg, MGParams params_in, equations eq_in = equations(), std::vector shape = {}) : params_(params_in), iter_counter(0), eqs_(eq_in) { From 44085c8250bd6e8607927889a4020737adf809a7 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 15:01:39 -0600 Subject: [PATCH 05/39] try again to add internal variables --- src/solvers/bicgstab_solver.hpp | 15 +++++++++++++++ 1 file changed, 15 
insertions(+) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index f5157a738a29..2bce5cd5ebb4 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -50,7 +50,22 @@ class BiCGSTABSolver { INTERNALSOLVERVARIABLE(x, r); INTERNALSOLVERVARIABLE(x, p); INTERNALSOLVERVARIABLE(x, u); + + std::vector GetInternalVariableNames() const { + std::vector names{rhat0::name(), + v::name(), + h::name(), + s::name(), + t::name(), + r::name(), + p::name(), + u::name()}; + if (params_.precondition) { + auto pre_names = preconditioner.GetInternalVariableNames(); + names.insert(names.end(), pre_names.begin(), pre_names.end()); } + return names; + } BiCGSTABSolver(StateDescriptor *pkg, BiCGSTABParams params_in, equations eq_in = equations(), std::vector shape = {}) From c0d96350b70df17c3142d96f5731ba030f58316f Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 15:02:07 -0600 Subject: [PATCH 06/39] Add printout flag and allow for changing residual tolerance --- src/solvers/bicgstab_solver.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 2bce5cd5ebb4..1833049d95a4 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -37,6 +37,7 @@ struct BiCGSTABParams { Real residual_tolerance = 1.e-12; Real restart_threshold = -1.0; bool precondition = true; + bool print_per_step = false; }; template @@ -119,7 +120,7 @@ class BiCGSTABSolver { this, i); region.AddRegionalDependencies(reg_dep_id, i, initialize); reg_dep_id++; - if (i == 0) { + if (i == 0 && params_.print_per_step) { tl.AddTask(dependence, [&]() { if (Globals::my_rank == 0) printf("# [0] v-cycle\n# [1] rms-residual\n# [2] rms-error\n"); @@ -174,7 +175,7 @@ class BiCGSTABSolver { [&](BiCGSTABSolver *solver, Mesh *pmesh, int partition) { if (partition != 0) return TaskStatus::complete; Real rms_res = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); - if (Globals::my_rank == 0) + if (Globals::my_rank == 0 && solver->params_.print_per_step) printf("%i %e\n", solver->iter_counter * 2 + 1, rms_res); return TaskStatus::complete; }, @@ -226,7 +227,7 @@ class BiCGSTABSolver { get_res2, [&](BiCGSTABSolver *solver, Mesh *pmesh) { Real rms_err = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); - if (Globals::my_rank == 0) + if (Globals::my_rank == 0 && solver->params_.print_per_step) printf("%i %e\n", solver->iter_counter * 2 + 2, rms_err); return TaskStatus::complete; }, @@ -285,7 +286,10 @@ class BiCGSTABSolver { Real GetFinalResidual() const { return final_residual; } int GetFinalIterations() const { return final_iteration; } - + + void UpdateResidualTolerance(Real tol) { + params_.residual_tolerance = tol; + } protected: MGSolver preconditioner; BiCGSTABParams params_; From fb16c6781dfb739eb14b7f56c35afa510a93177c Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 15:03:07 -0600 Subject: [PATCH 07/39] Minimize boundary communication --- src/solvers/mg_solver.hpp | 45 +++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 10129310822f..0f12e2591241 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -53,14 +53,13 @@ class MGSolver { : params_(params_in), iter_counter(0), eqs_(eq_in) { using namespace parthenon::refinement_ops; auto mres_err = - Metadata({Metadata::Cell, Metadata::Independent, 
Metadata::FillGhost, + Metadata({Metadata::Cell, Metadata::Independent, Metadata::GMGRestrict, Metadata::GMGProlongate, Metadata::OneCopy}, shape); mres_err.RegisterRefinementOps(); pkg->AddField(res_err::name(), mres_err); - auto mtemp = Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, - Metadata::WithFluxes, Metadata::OneCopy}, + auto mtemp = Metadata({Metadata::Cell, Metadata::Independent, Metadata::OneCopy}, shape); mtemp.RegisterRefinementOps(); pkg->AddField(temp::name(), mtemp); @@ -91,7 +90,8 @@ class MGSolver { auto mg_finest = AddLinearOperatorTasks(region, itl, dependence, partition, reg_dep_id, pmesh); auto &md = pmesh->mesh_data.GetOrAdd("base", partition); - auto calc_pointwise_res = eqs_.template Ax(itl, mg_finest, md); + auto comm = AddBoundaryExchangeTasks(mg_finest, itl, md, true); + auto calc_pointwise_res = eqs_.template Ax(itl, comm, md); calc_pointwise_res = itl.AddTask( calc_pointwise_res, AddFieldsAndStoreInteriorSelect, md, 1.0, -1.0, false); @@ -163,7 +163,7 @@ class MGSolver { int nblocks = md->NumBlocks(); std::vector include_block(nblocks, true); - auto desc = + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), include_block); if (params_.two_by_two_diagonal) { @@ -261,13 +261,12 @@ class MGSolver { depends_on = tl.AddTask(depends_on, CopyData, md); auto jacobi1 = AddJacobiIteration(tl, depends_on, multilevel, omega[ndim - 1][0], md); - if (stages < 2) { - return tl.AddTask(jacobi1, CopyData, md); - } - auto jacobi2 = AddJacobiIteration(tl, jacobi1, multilevel, - omega[ndim - 1][1], md); - if (stages < 3) return jacobi2; - auto jacobi3 = AddJacobiIteration(tl, jacobi2, multilevel, + auto copy1 = tl.AddTask(jacobi1, CopyData, md); + if (stages < 2) return copy1; + auto jacobi2 = AddJacobiIteration(tl, copy1, multilevel, + auto copy2 = tl.AddTask(jacobi2, CopyData, md); + if (stages < 3) return copy2; + auto jacobi3 = AddJacobiIteration(tl, copy2, multilevel, omega[ndim - 1][2], md); return tl.AddTask(jacobi3, CopyData, md); } @@ -312,14 +311,18 @@ class MGSolver { reg_dep_id++; // 1. Copy residual from dual purpose communication field to the rhs, should be // actual RHS for finest level - auto copy_u = tl.AddTask(set_from_finer, CopyData, md); if (!do_FAS) { - auto zero_u = tl.AddTask(copy_u, SetToZero, md); + auto zero_u = tl.AddTask(set_from_finer, SetToZero, md); auto copy_rhs = tl.AddTask(set_from_finer, CopyData, md); - set_from_finer = zero_u | copy_u | copy_rhs; + set_from_finer = zero_u | copy_rhs; } else { + // TODO(LFR): Determine if this boundary exchange task is required, I think it is + // to make sure that the boundaries of the restricted u are up to date before + // calling Ax. That being said, at least in one case commenting this line out + // didn't seem to impact the solution. set_from_finer = AddBoundaryExchangeTasks( set_from_finer, tl, md, multilevel); + set_from_finer = tl.AddTask(set_from_finer, CopyData, md); // This should set the rhs only in blocks that correspond to interior nodes, the // RHS of leaf blocks that are on this GMG level should have already been set on // entry into multigrid @@ -327,7 +330,6 @@ class MGSolver { set_from_finer = tl.AddTask( set_from_finer, AddFieldsAndStoreInteriorSelect, md, 1.0, 1.0, true); - set_from_finer = set_from_finer | copy_u; } } else { set_from_finer = tl.AddTask(set_from_finer, CopyData, md); @@ -383,7 +385,7 @@ class MGSolver { // 9. 
Send communication field to next finer level (should be error field for that // level) - TaskID last_task; + TaskID last_task = post_smooth; if (level < max_level) { auto copy_over = post_smooth; if (!do_FAS) { @@ -393,14 +395,15 @@ class MGSolver { md, 1.0, -1.0); copy_over = calc_err; } + // This is required to make sure boundaries of res_err are up to date before prolongation + copy_over = tl.AddTask(copy_over, CopyData, md); auto boundary = AddBoundaryExchangeTasks(copy_over, tl, md, multilevel); + auto copy_back = tl.AddTask(boundary, CopyData, md); last_task = - tl.AddTask(boundary, SendBoundBufs, md); - } else { - last_task = AddBoundaryExchangeTasks(post_smooth, tl, md, - multilevel); + tl.AddTask(copy_back, SendBoundBufs, md); } + // The boundaries are not up to date on return return last_task; } }; From 6f0be015efc3e41482012b9e1d44f5c4c08498e7 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 15:03:22 -0600 Subject: [PATCH 08/39] Make pack descriptors static and other small changes --- src/solvers/solver_utils.hpp | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index 87a2b71814c3..92845a43c942 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -154,8 +154,8 @@ TaskStatus CopyData(const std::shared_ptr> &md) { IndexRange jb = md->GetBoundsJ(IndexDomain::entire, te); IndexRange kb = md->GetBoundsK(IndexDomain::entire, te); - auto desc = parthenon::MakePackDescriptor(md.get()); - auto pack = desc.GetPack(md.get(), {}, only_fine_on_composite); + static auto desc = parthenon::MakePackDescriptor(md.get()); + auto pack = desc.GetPack(md.get(), only_fine_on_composite); parthenon::par_for( DEFAULT_LOOP_PATTERN, "SetPotentialToZero", DevExecSpace(), 0, pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, @@ -170,7 +170,7 @@ TaskStatus CopyData(const std::shared_ptr> &md) { template TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> &md, Real wa = 1.0, Real wb = 1.0, - bool only_interior = false) { + bool only_interior_blocks = false) { using TE = parthenon::TopologicalElement; TE te = TE::CC; IndexRange ib = md->GetBoundsI(IndexDomain::entire, te); @@ -179,13 +179,13 @@ TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> int nblocks = md->NumBlocks(); std::vector include_block(nblocks, true); - if (only_interior) { + if (only_interior_blocks) { // The neighbors array will only be set for a block if its a leaf block for (int b = 0; b < nblocks; ++b) include_block[b] = md->GetBlockData(b)->GetBlockPointer()->neighbors.size() == 0; } - auto desc = parthenon::MakePackDescriptor(md.get()); + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), include_block, only_fine_on_composite); parthenon::par_for( DEFAULT_LOOP_PATTERN, "SetPotentialToZero", DevExecSpace(), 0, @@ -212,9 +212,8 @@ TaskStatus SetToZero(const std::shared_ptr> &md) { int nblocks = md->NumBlocks(); using TE = parthenon::TopologicalElement; TE te = TE::CC; - std::vector include_block(nblocks, true); - auto desc = parthenon::MakePackDescriptor(md.get()); - auto pack = desc.GetPack(md.get(), include_block, only_fine_on_composite); + static auto desc = parthenon::MakePackDescriptor(md.get()); + auto pack = desc.GetPack(md.get(), only_fine_on_composite); const size_t scratch_size_in_bytes = 0; const int scratch_level = 1; const int ng = parthenon::Globals::nghost; @@ -224,9 +223,9 @@ TaskStatus SetToZero(const 
std::shared_ptr> &md) { KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { auto cb = GetIndexShape(pack(b, te, 0), ng); const auto &coords = pack.GetCoordinates(b); - IndexRange ib = cb.GetBoundsI(IndexDomain::interior, te); - IndexRange jb = cb.GetBoundsJ(IndexDomain::interior, te); - IndexRange kb = cb.GetBoundsK(IndexDomain::interior, te); + IndexRange ib = cb.GetBoundsI(IndexDomain::entire, te); + IndexRange jb = cb.GetBoundsJ(IndexDomain::entire, te); + IndexRange kb = cb.GetBoundsK(IndexDomain::entire, te); parthenon::par_for_inner(parthenon::inner_loop_pattern_simdfor_tag, member, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, [&](int k, int j, int i) { const int nvars = pack.GetUpperBound(b, var()) - @@ -247,7 +246,7 @@ TaskStatus DotProductLocal(const std::shared_ptr> &md, IndexRange jb = md->GetBoundsJ(IndexDomain::interior, te); IndexRange kb = md->GetBoundsK(IndexDomain::interior, te); - auto desc = parthenon::MakePackDescriptor(md.get()); + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get()); Real gsum(0); parthenon::par_reduce( From c197c9f1ea5ee61ea8bd687f0e3085ba4070ceac Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 16:44:30 -0600 Subject: [PATCH 09/39] fix commit error --- src/solvers/mg_solver.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 0f12e2591241..c04cee08eb5d 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -264,6 +264,7 @@ class MGSolver { auto copy1 = tl.AddTask(jacobi1, CopyData, md); if (stages < 2) return copy1; auto jacobi2 = AddJacobiIteration(tl, copy1, multilevel, + omega[ndim - 1][1], md); auto copy2 = tl.AddTask(jacobi2, CopyData, md); if (stages < 3) return copy2; auto jacobi3 = AddJacobiIteration(tl, copy2, multilevel, From b1253d7828f36a1538622ae87656bc23df2314e4 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 31 Oct 2023 17:28:31 -0600 Subject: [PATCH 10/39] Remove unused Gauss-Seidel stuff --- src/solvers/mg_solver.hpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index c04cee08eb5d..601ae8f4de22 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -147,11 +147,9 @@ class MGSolver { // These functions apparently have to be public to compile with cuda since // they contain device side lambdas public: - enum class GSType { all, red, black }; template - TaskStatus Jacobi(std::shared_ptr> &md, double weight, - GSType gs_type = GSType::all) { + TaskStatus Jacobi(std::shared_ptr> &md, double weight) { using namespace parthenon; const int ndim = md->GetMeshPointer()->ndim; using TE = parthenon::TopologicalElement; @@ -172,12 +170,7 @@ class MGSolver { kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const auto &coords = pack.GetCoordinates(b); - if ((i + j + k) % 2 == 1 && gs_type == GSType::red) return; - if ((i + j + k) % 2 == 0 && gs_type == GSType::black) return; - const int nvars = - pack.GetUpperBound(b, xnew_t()) - pack.GetLowerBound(b, xnew_t()) + 1; - const Real D11 = pack(b, te, D_t(0), k, j, i); const Real D22 = pack(b, te, D_t(1), k, j, i); const Real D12 = pack(b, te, D_t(2), k, j, i); @@ -204,8 +197,6 @@ class MGSolver { kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const auto &coords = pack.GetCoordinates(b); - if ((i + j + k) % 2 == 1 && gs_type == GSType::red) 
return; - if ((i + j + k) % 2 == 0 && gs_type == GSType::black) return; const int nvars = pack.GetUpperBound(b, xnew_t()) - pack.GetLowerBound(b, xnew_t()) + 1; @@ -234,8 +225,7 @@ class MGSolver { auto comm = AddBoundaryExchangeTasks(depends_on, tl, md, multilevel); auto mat_mult = eqs_.template Ax(tl, comm, md); - return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, omega, - GSType::all); + return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, omega); } template From e474d5daeee111ff7a4a31a217be9fa25b505e84 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Wed, 1 Nov 2023 17:17:16 -0600 Subject: [PATCH 11/39] Allow for pointer to tolerance --- src/solvers/bicgstab_solver.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 1833049d95a4..4187880c773d 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -71,7 +71,7 @@ class BiCGSTABSolver { BiCGSTABSolver(StateDescriptor *pkg, BiCGSTABParams params_in, equations eq_in = equations(), std::vector shape = {}) : preconditioner(pkg, params_in.mg_params, eq_in, shape), params_(params_in), - iter_counter(0), eqs_(eq_in) { + iter_counter(0), eqs_(eq_in), presidual_tolerance(¶ms_in.residual_tolerance) { using namespace refinement_ops; auto mu = Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, Metadata::WithFluxes, Metadata::GMGRestrict}, @@ -257,11 +257,11 @@ class BiCGSTABSolver { auto check = itl.SetCompletionTask( update_p | correct_x, [](BiCGSTABSolver *solver, Mesh *pmesh, int partition, int max_iter, - Real res_tol) { + Real *res_tol) { if (partition != 0) return TaskStatus::complete; solver->iter_counter++; Real rms_res = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); - if (rms_res < res_tol || solver->iter_counter >= max_iter) { + if (rms_res < *res_tol || solver->iter_counter >= max_iter) { solver->final_residual = rms_res; solver->final_iteration = solver->iter_counter; return TaskStatus::complete; @@ -274,7 +274,7 @@ class BiCGSTABSolver { solver->residual.val = 0.0; return TaskStatus::iterate; }, - this, pmesh, i, params_.max_iters, params_.residual_tolerance); + this, pmesh, i, params_.max_iters, presidual_tolerance); region.AddGlobalDependencies(reg_dep_id, i, check); reg_dep_id++; @@ -287,8 +287,8 @@ class BiCGSTABSolver { Real GetFinalResidual() const { return final_residual; } int GetFinalIterations() const { return final_iteration; } - void UpdateResidualTolerance(Real tol) { - params_.residual_tolerance = tol; + void UpdateResidualTolerance(Real *ptol) { + presidual_tolerance = ptol; } protected: MGSolver preconditioner; @@ -299,6 +299,7 @@ class BiCGSTABSolver { equations eqs_; Real final_residual; int final_iteration; + Real *presidual_tolerance; }; } // namespace solvers From 39cf7ed3ab3da932fb41690b6d56d6e4e52ac155 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 7 Nov 2023 17:44:28 -0700 Subject: [PATCH 12/39] Remove unecessary fill ghosts --- src/solvers/bicgstab_solver.hpp | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 4187880c773d..63519558701e 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -40,17 +40,17 @@ struct BiCGSTABParams { bool print_per_step = false; }; -template +template class BiCGSTABSolver { public: - INTERNALSOLVERVARIABLE(x, rhat0); - INTERNALSOLVERVARIABLE(x, v); - 
INTERNALSOLVERVARIABLE(x, h); - INTERNALSOLVERVARIABLE(x, s); - INTERNALSOLVERVARIABLE(x, t); - INTERNALSOLVERVARIABLE(x, r); - INTERNALSOLVERVARIABLE(x, p); - INTERNALSOLVERVARIABLE(x, u); + INTERNALSOLVERVARIABLE(u, rhat0); + INTERNALSOLVERVARIABLE(u, v); + INTERNALSOLVERVARIABLE(u, h); + INTERNALSOLVERVARIABLE(u, s); + INTERNALSOLVERVARIABLE(u, t); + INTERNALSOLVERVARIABLE(u, r); + INTERNALSOLVERVARIABLE(u, p); + INTERNALSOLVERVARIABLE(u, x); std::vector GetInternalVariableNames() const { std::vector names{rhat0::name(), @@ -60,7 +60,7 @@ class BiCGSTABSolver { t::name(), r::name(), p::name(), - u::name()}; + x::name()}; if (params_.precondition) { auto pre_names = preconditioner.GetInternalVariableNames(); names.insert(names.end(), pre_names.begin(), pre_names.end()); @@ -73,13 +73,9 @@ class BiCGSTABSolver { : preconditioner(pkg, params_in.mg_params, eq_in, shape), params_(params_in), iter_counter(0), eqs_(eq_in), presidual_tolerance(¶ms_in.residual_tolerance) { using namespace refinement_ops; - auto mu = Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, - Metadata::WithFluxes, Metadata::GMGRestrict}, - shape); - mu.RegisterRefinementOps(); auto m_no_ghost = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, shape); - pkg->AddField(u::name(), mu); + pkg->AddField(x::name(), m_no_ghost); pkg->AddField(rhat0::name(), m_no_ghost); pkg->AddField(v::name(), m_no_ghost); pkg->AddField(h::name(), m_no_ghost); @@ -277,8 +273,8 @@ class BiCGSTABSolver { this, pmesh, i, params_.max_iters, presidual_tolerance); region.AddGlobalDependencies(reg_dep_id, i, check); reg_dep_id++; - - return check; + + return tl.AddTask(check, CopyData, md); } Real GetSquaredResidualSum() const { return residual.val; } From 3db9655b254e90bad9a272b6080c1737d4200208 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 7 Nov 2023 17:45:02 -0700 Subject: [PATCH 13/39] Residual needs to have boundaries communicated for prolongation --- src/solvers/mg_solver.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 601ae8f4de22..96caf338d680 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -53,7 +53,7 @@ class MGSolver { : params_(params_in), iter_counter(0), eqs_(eq_in) { using namespace parthenon::refinement_ops; auto mres_err = - Metadata({Metadata::Cell, Metadata::Independent, + Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, Metadata::GMGRestrict, Metadata::GMGProlongate, Metadata::OneCopy}, shape); mres_err.RegisterRefinementOps(); From 920d613351dd0c33f09039b8366af29495235c92 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 7 Nov 2023 18:07:34 -0700 Subject: [PATCH 14/39] Remove fill ghost and explain why it is not required --- src/solvers/mg_solver.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 96caf338d680..8ccab33f42e5 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -52,8 +52,11 @@ class MGSolver { std::vector shape = {}) : params_(params_in), iter_counter(0), eqs_(eq_in) { using namespace parthenon::refinement_ops; + // The ghost cells of res_err need to be filled, but this is accomplished by + // copying res_err into u, communicating, then copying u back into res_err + // across all zones in a block auto mres_err = - Metadata({Metadata::Cell, Metadata::Independent, Metadata::FillGhost, + Metadata({Metadata::Cell, Metadata::Independent, 
Metadata::GMGRestrict, Metadata::GMGProlongate, Metadata::OneCopy}, shape); mres_err.RegisterRefinementOps(); From 9b0d269c5cd05b6e5667befebca732ecb3796d2a Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 7 Nov 2023 18:08:44 -0700 Subject: [PATCH 15/39] format and lint --- src/bvals/boundary_conditions_generic.hpp | 26 ++++--- src/interface/sparse_pack.hpp | 10 +-- src/solvers/bicgstab_solver.hpp | 25 +++---- src/solvers/mg_solver.hpp | 85 ++++++++++++----------- 4 files changed, 75 insertions(+), 71 deletions(-) diff --git a/src/bvals/boundary_conditions_generic.hpp b/src/bvals/boundary_conditions_generic.hpp index 775d0ab321aa..fc732e213afa 100644 --- a/src/bvals/boundary_conditions_generic.hpp +++ b/src/bvals/boundary_conditions_generic.hpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -39,23 +40,27 @@ enum class BCType { Outflow, Reflect, ConstantDeriv, Fixed, FixedFace }; namespace impl { using desc_key_t = std::tuple; template -using map_bc_pack_descriptor_t = std::unordered_map::Descriptor, tuple_hash>; +using map_bc_pack_descriptor_t = + std::unordered_map::Descriptor, + tuple_hash>; -template +template map_bc_pack_descriptor_t GetPackDescriptorMap(std::shared_ptr> &rc) { - std::vector> elements - {{TopologicalType::Cell, Metadata::Cell}, - {TopologicalType::Face, Metadata::Face}, - {TopologicalType::Edge, Metadata::Edge}, - {TopologicalType::Node, Metadata::Node}}; + std::vector> elements{ + {TopologicalType::Cell, Metadata::Cell}, + {TopologicalType::Face, Metadata::Face}, + {TopologicalType::Edge, Metadata::Edge}, + {TopologicalType::Node, Metadata::Node}}; map_bc_pack_descriptor_t my_map; for (auto [tt, md] : elements) { std::vector flags{Metadata::FillGhost}; flags.push_back(md); std::set opts{PDOpt::Coarse}; - my_map.emplace(std::make_pair(desc_key_t{true, tt}, MakePackDescriptor(rc.get(), flags, opts))); - my_map.emplace(std::make_pair(desc_key_t{false, tt}, MakePackDescriptor(rc.get(), flags))); + my_map.emplace(std::make_pair(desc_key_t{true, tt}, + MakePackDescriptor(rc.get(), flags, opts))); + my_map.emplace(std::make_pair(desc_key_t{false, tt}, + MakePackDescriptor(rc.get(), flags))); } return my_map; } @@ -74,7 +79,8 @@ void GenericBC(std::shared_ptr> &rc, bool coarse, constexpr bool INNER = (SIDE == BCSide::Inner); static auto descriptors = impl::GetPackDescriptorMap(rc); - auto q = descriptors[impl::desc_key_t{coarse, GetTopologicalType(el)}].GetPack(rc.get()); + auto q = + descriptors[impl::desc_key_t{coarse, GetTopologicalType(el)}].GetPack(rc.get()); const int b = 0; const int lstart = q.GetLowerBoundHost(b); const int lend = q.GetUpperBoundHost(b); diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp index dbde38279d0c..e64587039279 100644 --- a/src/interface/sparse_pack.hpp +++ b/src/interface/sparse_pack.hpp @@ -150,8 +150,8 @@ class SparsePack : public SparsePackBase { if constexpr (std::is_same>::value) { if (pmd->grid.type == GridType::two_level_composite && only_fine_two_level_composite_blocks) { - PARTHENON_REQUIRE(include_block.size() == pmd->NumBlocks(), - "Passed wrong size block include list."); + PARTHENON_REQUIRE(include_block.size() == pmd->NumBlocks(), + "Passed wrong size block include list."); int fine_level = pmd->grid.logical_level; for (int b = 0; b < pmd->NumBlocks(); ++b) include_block[b] = @@ -161,7 +161,7 @@ class SparsePack : public SparsePackBase { } return SparsePack(SparsePackBase::GetPack(pmd, *this, include_block)); } - + template SparsePack GetPack(T *pmd, bool 
only_fine_two_level_composite_blocks = true) const { // If this is a composite grid MeshData object, only include blocks on @@ -176,8 +176,8 @@ class SparsePack : public SparsePackBase { include_block[b] && (fine_level == pmd->GetBlockData(b)->GetBlockPointer()->loc.level()); return SparsePack(SparsePackBase::GetPack(pmd, *this, include_block)); - } else { - return SparsePack(SparsePackBase::GetPack(pmd, *this, std::vector{})); + } else { + return SparsePack(SparsePackBase::GetPack(pmd, *this, std::vector{})); } } return SparsePack(SparsePackBase::GetPack(pmd, *this, std::vector{})); diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 63519558701e..73be7eeb6e6f 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -51,17 +51,11 @@ class BiCGSTABSolver { INTERNALSOLVERVARIABLE(u, r); INTERNALSOLVERVARIABLE(u, p); INTERNALSOLVERVARIABLE(u, x); - - std::vector GetInternalVariableNames() const { - std::vector names{rhat0::name(), - v::name(), - h::name(), - s::name(), - t::name(), - r::name(), - p::name(), - x::name()}; - if (params_.precondition) { + + std::vector GetInternalVariableNames() const { + std::vector names{rhat0::name(), v::name(), h::name(), s::name(), + t::name(), r::name(), p::name(), x::name()}; + if (params_.precondition) { auto pre_names = preconditioner.GetInternalVariableNames(); names.insert(names.end(), pre_names.begin(), pre_names.end()); } @@ -273,7 +267,7 @@ class BiCGSTABSolver { this, pmesh, i, params_.max_iters, presidual_tolerance); region.AddGlobalDependencies(reg_dep_id, i, check); reg_dep_id++; - + return tl.AddTask(check, CopyData, md); } @@ -282,10 +276,9 @@ class BiCGSTABSolver { Real GetFinalResidual() const { return final_residual; } int GetFinalIterations() const { return final_iteration; } - - void UpdateResidualTolerance(Real *ptol) { - presidual_tolerance = ptol; - } + + void UpdateResidualTolerance(Real *ptol) { presidual_tolerance = ptol; } + protected: MGSolver preconditioner; BiCGSTABParams params_; diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 8ccab33f42e5..d2de65e355c6 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -45,33 +45,33 @@ class MGSolver { INTERNALSOLVERVARIABLE(u, temp); // Temporary storage INTERNALSOLVERVARIABLE(u, u0); // Storage for initial solution during FAS INTERNALSOLVERVARIABLE(u, D); // Storage for (approximate) diagonal - std::vector GetInternalVariableNames() const { + std::vector GetInternalVariableNames() const { return {res_err::name(), temp::name(), u0::name(), D::name()}; } MGSolver(StateDescriptor *pkg, MGParams params_in, equations eq_in = equations(), std::vector shape = {}) : params_(params_in), iter_counter(0), eqs_(eq_in) { using namespace parthenon::refinement_ops; - // The ghost cells of res_err need to be filled, but this is accomplished by - // copying res_err into u, communicating, then copying u back into res_err + // The ghost cells of res_err need to be filled, but this is accomplished by + // copying res_err into u, communicating, then copying u back into res_err // across all zones in a block auto mres_err = - Metadata({Metadata::Cell, Metadata::Independent, - Metadata::GMGRestrict, Metadata::GMGProlongate, Metadata::OneCopy}, + Metadata({Metadata::Cell, Metadata::Independent, Metadata::GMGRestrict, + Metadata::GMGProlongate, Metadata::OneCopy}, shape); mres_err.RegisterRefinementOps(); pkg->AddField(res_err::name(), mres_err); - auto mtemp = Metadata({Metadata::Cell, 
Metadata::Independent, Metadata::OneCopy}, - shape); + auto mtemp = + Metadata({Metadata::Cell, Metadata::Independent, Metadata::OneCopy}, shape); mtemp.RegisterRefinementOps(); pkg->AddField(temp::name(), mtemp); auto mu0 = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, shape); pkg->AddField(u0::name(), mu0); auto Dshape = shape; - if (params_.two_by_two_diagonal) { - Dshape = std::vector{4}; + if (params_.two_by_two_diagonal) { + Dshape = std::vector{4}; } auto mD = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, Dshape); pkg->AddField(D::name(), mD); @@ -150,7 +150,6 @@ class MGSolver { // These functions apparently have to be public to compile with cuda since // they contain device side lambdas public: - template TaskStatus Jacobi(std::shared_ptr> &md, double weight) { using namespace parthenon; @@ -169,35 +168,39 @@ class MGSolver { auto pack = desc.GetPack(md.get(), include_block); if (params_.two_by_two_diagonal) { parthenon::par_for( - DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, pack.GetNBlocks() - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, + pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const auto &coords = pack.GetCoordinates(b); - const Real D11 = pack(b, te, D_t(0), k, j, i); - const Real D22 = pack(b, te, D_t(1), k, j, i); - const Real D12 = pack(b, te, D_t(2), k, j, i); - const Real D21 = pack(b, te, D_t(3), k, j, i); + const Real D11 = pack(b, te, D_t(0), k, j, i); + const Real D22 = pack(b, te, D_t(1), k, j, i); + const Real D12 = pack(b, te, D_t(2), k, j, i); + const Real D21 = pack(b, te, D_t(3), k, j, i); const Real det = D11 * D22 - D12 * D21; - - const Real Du0 = D11 * pack(b, te, xold_t(0), k, j, i) - + D12 * pack(b, te, xold_t(1), k, j, i); - const Real Du1 = D21 * pack(b, te, xold_t(0), k, j, i) - + D22 * pack(b, te, xold_t(1), k, j, i); - - const Real t0 = pack(b, te, rhs_t(0), k, j, i) - pack(b, te, Axold_t(0), k, j, i) + Du0; - const Real t1 = pack(b, te, rhs_t(1), k, j, i) - pack(b, te, Axold_t(1), k, j, i) + Du1; - + + const Real Du0 = D11 * pack(b, te, xold_t(0), k, j, i) + + D12 * pack(b, te, xold_t(1), k, j, i); + const Real Du1 = D21 * pack(b, te, xold_t(0), k, j, i) + + D22 * pack(b, te, xold_t(1), k, j, i); + + const Real t0 = + pack(b, te, rhs_t(0), k, j, i) - pack(b, te, Axold_t(0), k, j, i) + Du0; + const Real t1 = + pack(b, te, rhs_t(1), k, j, i) - pack(b, te, Axold_t(1), k, j, i) + Du1; + const Real v0 = (D22 * t0 - D12 * t1) / det; const Real v1 = (-D21 * t0 + D11 * t1) / det; - - pack(b, te, xnew_t(0), k, j, i) = weight * v0 + (1.0 - weight) * pack(b, te, xold_t(0), k, j, i); - pack(b, te, xnew_t(1), k, j, i) = weight * v1 + (1.0 - weight) * pack(b, te, xold_t(1), k, j, i); + + pack(b, te, xnew_t(0), k, j, i) = + weight * v0 + (1.0 - weight) * pack(b, te, xold_t(0), k, j, i); + pack(b, te, xnew_t(1), k, j, i) = + weight * v1 + (1.0 - weight) * pack(b, te, xold_t(1), k, j, i); }); - } else { + } else { parthenon::par_for( - DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, pack.GetNBlocks() - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, + pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const auto &coords = pack.GetCoordinates(b); @@ -216,8 +219,8 @@ class MGSolver { weight * val / diag_elem + (1.0 - weight) * pack(b, 
te, xold_t(c), k, j, i); } - }); - } + }); + } return TaskStatus::complete; } @@ -228,7 +231,8 @@ class MGSolver { auto comm = AddBoundaryExchangeTasks(depends_on, tl, md, multilevel); auto mat_mult = eqs_.template Ax(tl, comm, md); - return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, omega); + return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, + omega); } template @@ -255,7 +259,7 @@ class MGSolver { auto jacobi1 = AddJacobiIteration(tl, depends_on, multilevel, omega[ndim - 1][0], md); auto copy1 = tl.AddTask(jacobi1, CopyData, md); - if (stages < 2) return copy1; + if (stages < 2) return copy1; auto jacobi2 = AddJacobiIteration(tl, copy1, multilevel, omega[ndim - 1][1], md); auto copy2 = tl.AddTask(jacobi2, CopyData, md); @@ -310,10 +314,10 @@ class MGSolver { auto copy_rhs = tl.AddTask(set_from_finer, CopyData, md); set_from_finer = zero_u | copy_rhs; } else { - // TODO(LFR): Determine if this boundary exchange task is required, I think it is - // to make sure that the boundaries of the restricted u are up to date before - // calling Ax. That being said, at least in one case commenting this line out - // didn't seem to impact the solution. + // TODO(LFR): Determine if this boundary exchange task is required, I think it is + // to make sure that the boundaries of the restricted u are up to date before + // calling Ax. That being said, at least in one case commenting this line out + // didn't seem to impact the solution. set_from_finer = AddBoundaryExchangeTasks( set_from_finer, tl, md, multilevel); set_from_finer = tl.AddTask(set_from_finer, CopyData, md); @@ -389,7 +393,8 @@ class MGSolver { md, 1.0, -1.0); copy_over = calc_err; } - // This is required to make sure boundaries of res_err are up to date before prolongation + // This is required to make sure boundaries of res_err are up to date before + // prolongation copy_over = tl.AddTask(copy_over, CopyData, md); auto boundary = AddBoundaryExchangeTasks(copy_over, tl, md, multilevel); From 1a8b49c3a2307782466b7df4220b6495cd3c3a82 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 7 Nov 2023 19:44:17 -0700 Subject: [PATCH 16/39] fix bug --- src/solvers/mg_solver.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index d2de65e355c6..aa2a93dc77fd 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -395,10 +395,12 @@ class MGSolver { } // This is required to make sure boundaries of res_err are up to date before // prolongation - copy_over = tl.AddTask(copy_over, CopyData, md); + copy_over = tl.AddTask(copy_over, CopyData, md); + copy_over = tl.AddTask(copy_over, CopyData, md); auto boundary = AddBoundaryExchangeTasks(copy_over, tl, md, multilevel); auto copy_back = tl.AddTask(boundary, CopyData, md); + copy_back = tl.AddTask(copy_back, CopyData, md); last_task = tl.AddTask(copy_back, SendBoundBufs, md); } From 3a4e0949a29db704d2f9a4c64d6029d6dfea4b97 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 7 Nov 2023 19:49:59 -0700 Subject: [PATCH 17/39] format and lint --- example/poisson_gmg/poisson_package.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/example/poisson_gmg/poisson_package.cpp b/example/poisson_gmg/poisson_package.cpp index a932ce4aaa5e..9c4d3ef27fe1 100644 --- a/example/poisson_gmg/poisson_package.cpp +++ b/example/poisson_gmg/poisson_package.cpp @@ -147,6 +147,7 @@ std::shared_ptr Initialize(ParameterInput *pin) { bicgstab_params.max_iters = max_poisson_iterations; bicgstab_params.residual_tolerance = 
res_tol; bicgstab_params.precondition = precondition; + bicgstab_params.print_per_step = true; parthenon::solvers::BiCGSTABSolver bicg_solver( pkg.get(), bicgstab_params, eq); pkg->AddParam<>("MGBiCGSTABsolver", bicg_solver, From a2e903aa3584bd496a4c3c803a5c3b5907fe59d9 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 16 Nov 2023 19:06:24 -0700 Subject: [PATCH 18/39] fix bug introduced by merge --- src/solvers/bicgstab_solver.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 63480baa900a..5d74a891716a 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -47,17 +47,17 @@ struct BiCGSTABParams { // // that takes a field associated with x_t and applies // the matrix A to it and stores the result in y_t. -template +template class BiCGSTABSolver { public: - PARTHENON_INTERNALSOLVERVARIABLE(x, rhat0); - PARTHENON_INTERNALSOLVERVARIABLE(x, v); - PARTHENON_INTERNALSOLVERVARIABLE(x, h); - PARTHENON_INTERNALSOLVERVARIABLE(x, s); - PARTHENON_INTERNALSOLVERVARIABLE(x, t); - PARTHENON_INTERNALSOLVERVARIABLE(x, r); - PARTHENON_INTERNALSOLVERVARIABLE(x, p); - PARTHENON_INTERNALSOLVERVARIABLE(x, u); + PARTHENON_INTERNALSOLVERVARIABLE(u, rhat0); + PARTHENON_INTERNALSOLVERVARIABLE(u, v); + PARTHENON_INTERNALSOLVERVARIABLE(u, h); + PARTHENON_INTERNALSOLVERVARIABLE(u, s); + PARTHENON_INTERNALSOLVERVARIABLE(u, t); + PARTHENON_INTERNALSOLVERVARIABLE(u, r); + PARTHENON_INTERNALSOLVERVARIABLE(u, p); + PARTHENON_INTERNALSOLVERVARIABLE(u, x); std::vector GetInternalVariableNames() const { std::vector names{rhat0::name(), v::name(), h::name(), s::name(), From b8f3c97d55d59a9c1c9761a20083a799ef293a44 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Fri, 17 Nov 2023 13:13:15 -0700 Subject: [PATCH 19/39] Try to speed up TaskID processing --- src/tasks/task_id.cpp | 17 +++++++++++------ src/tasks/task_id.hpp | 8 +++++--- src/tasks/task_list.hpp | 12 ++++-------- src/tasks/task_types.hpp | 4 ++-- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/tasks/task_id.cpp b/src/tasks/task_id.cpp index 70bacbea8f36..a52f1e3419bc 100644 --- a/src/tasks/task_id.cpp +++ b/src/tasks/task_id.cpp @@ -17,6 +17,7 @@ //! \file tasks.cpp // \brief implementation of the TaskID class +#include "utils/error_checking.hpp" #include "tasks/task_id.hpp" #include @@ -40,12 +41,14 @@ void TaskID::Set(int id) { const int n_myblocks = id / BITBLOCK + 1; // grow if necessary. 
never shrink if (n_myblocks > bitblocks.size()) bitblocks.resize(n_myblocks); - bitblocks[n_myblocks - 1].set(id % BITBLOCK); + bitblocks[n_myblocks - 1] |= (static_cast(1) << (id % BITBLOCK)); + bit = id; + nbits_set++; } void TaskID::clear() { for (auto &bset : bitblocks) { - bset.reset(); + bset = 0; } } @@ -65,7 +68,7 @@ bool TaskID::CheckDependencies(const TaskID &rhs) const { if ((bitblocks[i] & rhs.bitblocks[i]) != rhs.bitblocks[i]) return false; } for (int i = n_myblocks; i < n_srcblocks; i++) { - if (rhs.bitblocks[i].any()) return false; + if (rhs.bitblocks[i] > 0) return false; } } return true; @@ -93,6 +96,8 @@ void TaskID::SetFinished(const TaskID &rhs) { } bool TaskID::operator==(const TaskID &rhs) const { + if (nbits_set != rhs.nbits_set) return false; + const int n_myblocks = bitblocks.size(); const int n_srcblocks = rhs.bitblocks.size(); if (n_myblocks == n_srcblocks) { @@ -104,14 +109,14 @@ bool TaskID::operator==(const TaskID &rhs) const { if (bitblocks[i] != rhs.bitblocks[i]) return false; } for (int i = n_srcblocks; i < n_myblocks; i++) { - if (bitblocks[i].any()) return false; + if (bitblocks[i] > 0) return false; } } else { for (int i = 0; i < n_myblocks; i++) { if (bitblocks[i] != rhs.bitblocks[i]) return false; } for (int i = n_myblocks; i < n_srcblocks; i++) { - if (rhs.bitblocks[i].any()) return false; + if (rhs.bitblocks[i] > 0) return false; } } return true; @@ -149,7 +154,7 @@ TaskID TaskID::operator|(const TaskID &rhs) const { std::string TaskID::to_string() const { std::string bs; for (int i = bitblocks.size() - 1; i >= 0; i--) { - bs += bitblocks[i].to_string(); + //bs += bitblocks[i].to_string(); } return bs; } diff --git a/src/tasks/task_id.hpp b/src/tasks/task_id.hpp index 54043001af66..942cf0a4e5ed 100644 --- a/src/tasks/task_id.hpp +++ b/src/tasks/task_id.hpp @@ -26,11 +26,11 @@ namespace parthenon { //! \class TaskID // \brief generalization of bit fields for Task IDs, status, and dependencies. 
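// A concrete illustration of the per-word dependency test this change speeds
// up (a sketch only; `mine` and `deps` stand in for matching entries of
// bitblocks and rhs.bitblocks in CheckDependencies above):
//
//   uint64_t mine = 0b1011, deps = 0b0011;
//   bool satisfied = (mine & deps) == deps; // true: every dependency bit is set in mine
//
// Widening BITBLOCK from 16 to 64 lets a single AND-and-compare cover four
// times as many tasks per loop iteration as the previous std::bitset blocks.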
-#define BITBLOCK 16 +#define BITBLOCK 64 class TaskID { public: - TaskID() { Set(0); } + TaskID() : nbits_set(0), bit(-1) { Set(0); } explicit TaskID(int id); void Set(int id); @@ -43,7 +43,9 @@ class TaskID { std::string to_string() const; private: - std::vector> bitblocks; + int nbits_set; + int bit; + std::vector bitblocks; }; } // namespace parthenon diff --git a/src/tasks/task_list.hpp b/src/tasks/task_list.hpp index 322d1a788d70..47ae93567b23 100644 --- a/src/tasks/task_list.hpp +++ b/src/tasks/task_list.hpp @@ -139,7 +139,7 @@ class TaskList { bool CheckDependencies(const TaskID &id) const { return tasks_completed_.CheckDependencies(id); } - bool CheckTaskRan(TaskID id) const { + bool CheckTaskRan(const TaskID &id) const { for (auto &task : task_list_) { if (task.GetID() == id) { return (task.GetStatus() != TaskStatus::incomplete && @@ -258,7 +258,7 @@ class TaskList { ++task; continue; } - auto dep = task->GetDependency(); + const auto &dep = task->GetDependency(); if (CheckDependencies(dep)) { (*task)(); if (task->GetStatus() == TaskStatus::complete && !task->IsRegional()) { @@ -475,9 +475,7 @@ class TaskRegion { auto &lvec = id_for_reg[reg_id]; int n_to_run = lvec.size(); int n_ran = 0; - for (auto &pair : lvec) { - int list_index = pair.first; - TaskID id = pair.second; + for (auto &[list_index, id] : lvec) { if (lists[list_index].CheckTaskRan(id)) { n_ran++; } @@ -488,9 +486,7 @@ class TaskRegion { auto &lvec = id_for_reg[reg_id]; int n_to_finish = lvec.size(); int n_finished = 0; - for (auto &pair : lvec) { - int list_index = pair.first; - TaskID id = pair.second; + for (auto &[list_index, id] : lvec) { if (lists[list_index].CheckTaskCompletion(id)) { n_finished++; } diff --git a/src/tasks/task_types.hpp b/src/tasks/task_types.hpp index c3475784e50a..414169450980 100644 --- a/src/tasks/task_types.hpp +++ b/src/tasks/task_types.hpp @@ -71,8 +71,8 @@ class Task { } } void SetID(const TaskID &id) { myid_ = id; } - TaskID GetID() const { return myid_; } - TaskID GetDependency() const { return dep_; } + const TaskID &GetID() const { return myid_; } + const TaskID &GetDependency() const { return dep_; } TaskStatus GetStatus() const { return status_; } void SetStatus(const TaskStatus &status) { status_ = status; } TaskType GetType() const { return type_; } From 8d2a64003fd80f3907d3e262d5f1ac277a9a123e Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Fri, 17 Nov 2023 13:46:26 -0700 Subject: [PATCH 20/39] Update MeshData::Initialize to work with MG block lists --- src/interface/mesh_data.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/interface/mesh_data.cpp b/src/interface/mesh_data.cpp index 9b808b738a87..bb1a532ef6c8 100644 --- a/src/interface/mesh_data.cpp +++ b/src/interface/mesh_data.cpp @@ -25,9 +25,19 @@ void MeshData::Initialize(const MeshData *src, pmy_mesh_ = src->GetParentPointer(); const int nblocks = src->NumBlocks(); block_data_.resize(nblocks); - for (int i = 0; i < nblocks; i++) { - block_data_[i] = pmy_mesh_->block_list[i]->meshblock_data.Add( - stage_name_, src->GetBlockData(i), names, shallow); + + grid = src->grid; + if (grid.type == GridType::two_level_composite) { + int gmg_level = src->grid.logical_level - pmy_mesh_->GetGMGMinLogicalLevel(); + for (int i = 0; i < nblocks; i++) { + block_data_[i] = pmy_mesh_->gmg_block_lists[gmg_level][i]->meshblock_data.Add( + stage_name_, src->GetBlockData(i), names, shallow); + } + } else { + for (int i = 0; i < nblocks; i++) { + block_data_[i] = 
pmy_mesh_->block_list[i]->meshblock_data.Add( stage_name_, src->GetBlockData(i), names, shallow); } } } From 3af1e4777be62ae4a481c40f7d07e32813dbd33c Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Fri, 17 Nov 2023 13:46:49 -0700 Subject: [PATCH 21/39] Minimize set of fields that are communicated --- src/solvers/bicgstab_solver.hpp | 6 +++ src/solvers/mg_solver.hpp | 79 ++++++++++++++++++++++++++------- 2 files changed, 69 insertions(+), 16 deletions(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 5d74a891716a..d90f99f116f5 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -86,6 +86,12 @@ class BiCGSTABSolver { pkg->AddField(p::name(), m_no_ghost); } + template + TaskID AddSetupTasks(TaskRegion &region, TL_t &tl, TaskID dependence, + int partition, int &reg_dep_id, Mesh *pmesh) { + return preconditioner.AddSetupTasks(region, tl, dependence, partition, reg_dep_id, pmesh); + } + TaskID AddTasks(TaskList &tl, IterativeTasks &itl, TaskID dependence, int i, Mesh *pmesh, TaskRegion &region, int &reg_dep_id) { using namespace utils; diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 1ae07c86c28d..dca2ad840c6f 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -150,6 +150,18 @@ class MGSolver { return AddMultiGridTasksPartitionLevel(region, tl, dependence, partition, reg_dep_id, max_level, min_level, max_level, pmesh); } + + template + TaskID AddSetupTasks(TaskRegion &region, TL_t &tl, TaskID dependence, + int partition, int &reg_dep_id, Mesh *pmesh) { + using namespace utils; + + int min_level = 0; + int max_level = pmesh->GetGMGMaxLevel(); + + return AddMultiGridSetupPartitionLevel(region, tl, dependence, partition, reg_dep_id, + max_level, min_level, max_level, pmesh); + } Real GetSquaredResidualSum() const { return residual.val; } int GetCurrentIterations() const { return iter_counter; } @@ -242,10 +254,11 @@ class MGSolver { template TaskID AddJacobiIteration(TL_t &tl, TaskID depends_on, bool multilevel, Real omega, - std::shared_ptr> &md) { + std::shared_ptr> &md, + std::shared_ptr> &md_comm) { using namespace utils; - auto comm = AddBoundaryExchangeTasks(depends_on, tl, md, multilevel); + auto comm = AddBoundaryExchangeTasks(depends_on, tl, md_comm, multilevel); auto mat_mult = eqs_.template Ax(tl, comm, md); return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, omega); } template TaskID AddSRJIteration(TL_t &tl, TaskID depends_on, int stages, bool multilevel, - std::shared_ptr> &md) { + std::shared_ptr> &md, + std::shared_ptr> &md_comm) { using namespace utils; int ndim = md->GetParentPointer()->ndim; @@ -273,18 +287,49 @@ class MGSolver { // fine-coarse boundaries of temp are correctly updated during communication depends_on = tl.AddTask(depends_on, CopyData, md); auto jacobi1 = AddJacobiIteration(tl, depends_on, multilevel, - omega[ndim - 1][0], md); + omega[ndim - 1][0], md, md_comm); auto copy1 = tl.AddTask(jacobi1, CopyData, md); if (stages < 2) return copy1; auto jacobi2 = AddJacobiIteration(tl, copy1, multilevel, - omega[ndim - 1][1], md); + omega[ndim - 1][1], md, md_comm); auto copy2 = tl.AddTask(jacobi2, CopyData, md); if (stages < 3) return copy2; auto jacobi3 = AddJacobiIteration(tl, copy2, multilevel, - omega[ndim - 1][2], md); + omega[ndim - 1][2], md, md_comm); return tl.AddTask(jacobi3, CopyData, md); } + template + TaskID AddMultiGridSetupPartitionLevel(TaskRegion &region, TL_t &tl, TaskID dependence, + int partition, int &reg_dep_id,
int level, + int min_level, int max_level, Mesh *pmesh) { + using namespace utils; + + bool multilevel = (level != min_level); + + auto &md = pmesh->gmg_mesh_data[level].GetOrAdd(level, "base", partition); + + auto task_out = dependence; + if (level < max_level) { + task_out = + tl.AddTask(task_out, ReceiveBoundBufs, md); + task_out = + tl.AddTask(task_out, SetBounds, md); + } + + // If we are finer than the coarsest level: + if (level > min_level) { + task_out = + tl.AddTask(task_out, SendBoundBufs, md); + task_out = AddMultiGridSetupPartitionLevel(region, tl, task_out, + partition, reg_dep_id, level - 1, + min_level, max_level, pmesh); + } + + // The boundaries are not up to date on return + return task_out; + } + template TaskID AddMultiGridTasksPartitionLevel(TaskRegion ®ion, TL_t &tl, TaskID dependence, int partition, int ®_dep_id, int level, @@ -312,15 +357,17 @@ class MGSolver { bool multilevel = (level != min_level); auto &md = pmesh->gmg_mesh_data[level].GetOrAdd(level, "base", partition); + std::string label = "comm_" + std::to_string(level) + "_" + std::to_string(partition); + auto &md_comm = pmesh->gmg_mesh_data[level].AddShallow(label, md, std::vector{u::name(), res_err::name()}); // 0. Receive residual from coarser level if there is one auto set_from_finer = dependence; if (level < max_level) { // Fill fields with restricted values auto recv_from_finer = - tl.AddTask(dependence, ReceiveBoundBufs, md); + tl.AddTask(dependence, ReceiveBoundBufs, md_comm); set_from_finer = - tl.AddTask(recv_from_finer, SetBounds, md); + tl.AddTask(recv_from_finer, SetBounds, md_comm); region.AddRegionalDependencies(reg_dep_id, partition, set_from_finer); reg_dep_id++; // 1. Copy residual from dual purpose communication field to the rhs, should be @@ -335,7 +382,7 @@ class MGSolver { // calling Ax. That being said, at least in one case commenting this line out // didn't seem to impact the solution. set_from_finer = AddBoundaryExchangeTasks( - set_from_finer, tl, md, multilevel); + set_from_finer, tl, md_comm, multilevel); set_from_finer = tl.AddTask(set_from_finer, CopyData, md); // This should set the rhs only in blocks that correspond to interior nodes, the // RHS of leaf blocks that are on this GMG level should have already been set on @@ -353,12 +400,12 @@ class MGSolver { set_from_finer = tl.AddTask(set_from_finer, &equations::template SetDiagonal, &eqs_, md); auto pre_smooth = AddSRJIteration(tl, set_from_finer, - pre_stages, multilevel, md); + pre_stages, multilevel, md, md_comm); // If we are finer than the coarsest level: auto post_smooth = pre_smooth; if (level > min_level) { // 3. Communicate same level boundaries so that u is up to date everywhere - auto comm_u = AddBoundaryExchangeTasks(pre_smooth, tl, md, + auto comm_u = AddBoundaryExchangeTasks(pre_smooth, tl, md_comm, multilevel); // 4. Caclulate residual and store in communication field @@ -369,7 +416,7 @@ class MGSolver { // 5. Restrict communication field and send to next level auto communicate_to_coarse = - tl.AddTask(residual, SendBoundBufs, md); + tl.AddTask(residual, SendBoundBufs, md_comm); auto coarser = AddMultiGridTasksPartitionLevel(region, tl, communicate_to_coarse, partition, reg_dep_id, level - 1, @@ -377,11 +424,11 @@ class MGSolver { // 6. 
Receive error field into communication field and prolongate auto recv_from_coarser = - tl.AddTask(coarser, ReceiveBoundBufs, md); + tl.AddTask(coarser, ReceiveBoundBufs, md_comm); auto set_from_coarser = - tl.AddTask(recv_from_coarser, SetBounds, md); + tl.AddTask(recv_from_coarser, SetBounds, md_comm); auto prolongate = tl.AddTask( - set_from_coarser, ProlongateBounds, md); + set_from_coarser, ProlongateBounds, md_comm); region.AddRegionalDependencies(reg_dep_id, partition, prolongate); reg_dep_id++; @@ -392,7 +439,7 @@ class MGSolver { // 8. Post smooth using communication field and stored RHS post_smooth = AddSRJIteration(tl, update_sol, post_stages, - multilevel, md); + multilevel, md, md_comm); } else { post_smooth = tl.AddTask(pre_smooth, CopyData, md); } From 40f0f6aa46c09db0b79fa04105b6c6b826c7707f Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 28 Nov 2023 11:48:15 -0700 Subject: [PATCH 22/39] Add global min task --- src/solvers/solver_utils.hpp | 57 ++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index 9aaf657eebac..29d301135ba6 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -300,6 +300,63 @@ TaskID DotProduct(TaskID dependency_in, TaskRegion ®ion, TL_t &tl, int partit return finish_global_adotb; } +template +TaskStatus GlobalMinLocal(const std::shared_ptr> &md, + AllReduce *amin) { + using TE = parthenon::TopologicalElement; + TE te = TE::CC; + IndexRange ib = md->GetBoundsI(IndexDomain::interior, te); + IndexRange jb = md->GetBoundsJ(IndexDomain::interior, te); + IndexRange kb = md->GetBoundsK(IndexDomain::interior, te); + + static auto desc = parthenon::MakePackDescriptor(md.get()); + auto pack = desc.GetPack(md.get()); + Real gmin(0); + parthenon::par_reduce( + parthenon::loop_pattern_mdrange_tag, "DotProduct", DevExecSpace(), 0, + pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lmin) { + const int nvars = pack.GetUpperBound(b, a_t()) - pack.GetLowerBound(b, a_t()) + 1; + // TODO(LFR): If this becomes a bottleneck, exploit hierarchical parallelism and + // pull the loop over vars outside of the innermost loop to promote + // vectorization. + for (int c = 0; c < nvars; ++c) + lmin = std::min(lmin, pack(b, te, a_t(c), k, j, i)); + }, + Kokkos::Min(gmin)); + amin->val = std::min(gmin, amin->val); + return TaskStatus::complete; +} + +template +TaskID GlobalMin(TaskID dependency_in, TaskRegion ®ion, TL_t &tl, int partition, + int ®_dep_id, AllReduce *amin, + const std::shared_ptr> &md) { + using namespace impl; + auto zero_amin = (partition == 0 ? tl.AddTask( + dependency_in, + [](AllReduce *r) { + r->val = std::numeric_limits::max(); + return TaskStatus::complete; + }, + amin) + : dependency_in); + region.AddRegionalDependencies(reg_dep_id, partition, zero_amin); + reg_dep_id++; + auto get_amin = tl.AddTask(zero_amin, GlobalMinLocal, md, amin); + region.AddRegionalDependencies(reg_dep_id, partition, get_amin); + reg_dep_id++; + auto start_global_amin = + (partition == 0 + ? 
tl.AddTask(get_amin, &AllReduce::StartReduce, amin, MPI_MIN) + : get_amin); + auto finish_global_amin = + tl.AddTask(start_global_amin, &AllReduce::CheckReduce, amin); + region.AddRegionalDependencies(reg_dep_id, partition, finish_global_amin); + reg_dep_id++; + return finish_global_amin; +} + } // namespace utils } // namespace solvers From b586114b2e9ee7c75829f472110f43629e2ddb73 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 21 Dec 2023 10:23:23 -0700 Subject: [PATCH 23/39] Add BiCGSTAB communicator --- src/solvers/bicgstab_solver.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index a976ad223043..45ff23032e8c 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -96,7 +96,11 @@ class BiCGSTABSolver { Mesh *pmesh, TaskRegion ®ion, int ®_dep_id) { using namespace utils; auto &md = pmesh->mesh_data.GetOrAdd("base", i); + std::string label = "bicg_comm_" + std::to_string(i); + auto &md_comm = pmesh->mesh_data.AddShallow(label, md, std::vector{u::name()}); + iter_counter = 0; + bool multilevel = pmesh->multilevel; // Initialization: x <- 0, r <- rhs, rhat0 <- rhs, // rhat0r_old <- (rhat0, r), p <- r, u <- 0 @@ -144,7 +148,7 @@ class BiCGSTABSolver { } // 2. v <- A u - auto comm = AddBoundaryExchangeTasks(precon1, itl, md, true); + auto comm = AddBoundaryExchangeTasks(precon1, itl, md_comm, multilevel); auto get_v = eqs_.template Ax(itl, comm, md); // 3. rhat0v <- (rhat0, v) @@ -195,7 +199,7 @@ class BiCGSTABSolver { } // 7. t <- A u - auto pre_t_comm = AddBoundaryExchangeTasks(precon2, itl, md, true); + auto pre_t_comm = AddBoundaryExchangeTasks(precon2, itl, md_comm, multilevel); auto get_t = eqs_.template Ax(itl, pre_t_comm, md); // 8. omega <- (t,s) / (t,t) From ebabb470aca5f4c6a9209d8e969234beec5de2f5 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 21 Dec 2023 10:24:33 -0700 Subject: [PATCH 24/39] Speed up utility loops --- src/solvers/solver_utils.hpp | 55 +++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index 29d301135ba6..49092e906bca 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -146,7 +146,7 @@ struct Stencil { }; namespace utils { -template +template TaskStatus CopyData(const std::shared_ptr> &md) { using TE = parthenon::TopologicalElement; TE te = TE::CC; @@ -154,23 +154,28 @@ TaskStatus CopyData(const std::shared_ptr> &md) { IndexRange jb = md->GetBoundsJ(IndexDomain::entire, te); IndexRange kb = md->GetBoundsK(IndexDomain::entire, te); - static auto desc = parthenon::MakePackDescriptor(md.get()); + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), only_fine_on_composite); - parthenon::par_for( - DEFAULT_LOOP_PATTERN, "CopyData", DevExecSpace(), 0, pack.GetNBlocks() - 1, kb.s, - kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - // TODO(LFR): If this becomes a bottleneck, exploit hierarchical parallelism and - // pull the loop over vars outside of the innermost loop to promote - // vectorization. 
- const int nvars = pack.GetUpperBound(b, in()) - pack.GetLowerBound(b, in()) + 1; - for (int c = 0; c < nvars; ++c) - pack(b, te, out(c), k, j, i) = pack(b, te, in(c), k, j, i); + const int scratch_size = 0; + const int scratch_level = 0; + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "CopyData", DevExecSpace(), scratch_size, scratch_level, + 0, pack.GetNBlocks() - 1, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { + const int nvars = pack.GetUpperBound(b, in_t()) - pack.GetLowerBound(b, in_t()) + 1; + const int npoints = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); + for (int c = 0; c < nvars; ++c) { + Real *in = &pack(b, te, in_t(c), kb.s, jb.s, ib.s); + Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); + parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { + out[idx] = in[idx]; + }); + } }); return TaskStatus::complete; } -template +template TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> &md, Real wa = 1.0, Real wb = 1.0, bool only_interior_blocks = false) { @@ -188,19 +193,23 @@ TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> include_block[b] = md->GetBlockData(b)->GetBlockPointer()->neighbors.size() == 0; } - static auto desc = parthenon::MakePackDescriptor(md.get()); + static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), include_block, only_fine_on_composite); - parthenon::par_for( - DEFAULT_LOOP_PATTERN, "AddFieldsAndStore", DevExecSpace(), 0, pack.GetNBlocks() - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - // TODO(LFR): If this becomes a bottleneck, exploit hierarchical parallelism and - // pull the loop over vars outside of the innermost loop to promote - // vectorization. 
+ const int scratch_size = 0; + const int scratch_level = 0; + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "AddFieldsAndStore", DevExecSpace(), scratch_size, scratch_level, + 0, pack.GetNBlocks() - 1, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { const int nvars = pack.GetUpperBound(b, a_t()) - pack.GetLowerBound(b, a_t()) + 1; + const int npoints = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); for (int c = 0; c < nvars; ++c) { - pack(b, te, out(c), k, j, i) = - wa * pack(b, te, a_t(c), k, j, i) + wb * pack(b, te, b_t(c), k, j, i); + Real *avar = &pack(b, te, a_t(c), kb.s, jb.s, ib.s); + Real *bvar = &pack(b, te, b_t(c), kb.s, jb.s, ib.s); + Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); + parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { + out[idx] = wa * avar[idx] + wb * bvar[idx]; + }); } }); return TaskStatus::complete; From 676d6709cd09a35d7bbb93ba31508edae3c0db49 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 21 Dec 2023 10:25:55 -0700 Subject: [PATCH 25/39] Allow for specifying maximum number of coarsenings in MG and vectorize inner loop --- src/solvers/mg_solver.hpp | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index dca2ad840c6f..5e4c1b1ed534 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -36,6 +36,7 @@ struct MGParams { bool do_FAS = true; std::string smoother = "SRJ2"; bool two_by_two_diagonal = false; + int max_coarsenings = std::numeric_limits::max(); }; // The equations class must include a template method @@ -144,7 +145,7 @@ class MGSolver { using namespace utils; iter_counter = 0; - int min_level = 0; + int min_level = std::max(pmesh->GetGMGMaxLevel() - params_.max_coarsenings, 0); int max_level = pmesh->GetGMGMaxLevel(); return AddMultiGridTasksPartitionLevel(region, tl, dependence, partition, reg_dep_id, @@ -226,26 +227,28 @@ class MGSolver { weight * v1 + (1.0 - weight) * pack(b, te, xold_t(1), k, j, i); }); } else { - parthenon::par_for( - DEFAULT_LOOP_PATTERN, "CaclulateFluxes", DevExecSpace(), 0, - pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { - const auto &coords = pack.GetCoordinates(b); - + const int scratch_size = 0; + const int scratch_level = 0; + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "Jacobi", DevExecSpace(), scratch_size, scratch_level, + 0, pack.GetNBlocks() - 1, kb.s, kb.e, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { const int nvars = pack.GetUpperBound(b, xnew_t()) - pack.GetLowerBound(b, xnew_t()) + 1; - for (int c = 0; c < nvars; ++c) { - Real diag_elem = pack(b, te, D_t(c), k, j, i); - - // Get the off-diagonal contribution to Ax = (D + L + U)x = y - Real off_diag = pack(b, te, Axold_t(c), k, j, i) - - diag_elem * pack(b, te, xold_t(c), k, j, i); - - Real val = pack(b, te, rhs_t(c), k, j, i) - off_diag; - pack(b, te, xnew_t(c), k, j, i) = - weight * val / diag_elem + - (1.0 - weight) * pack(b, te, xold_t(c), k, j, i); + Real *Ax = &pack(b, te, Axold_t(c), k, jb.s, ib.s); + Real *diag = &pack(b, te, D_t(c), k, jb.s, ib.s); + Real *rhs = &pack(b, te, rhs_t(c), k, jb.s, ib.s); + Real *xo = &pack(b, te, xold_t(c), k, jb.s, ib.s); + Real *xn = &pack(b, te, xnew_t(c), k, jb.s, ib.s); + const int npoints = (jb.e - jb.s + 1) + * (ib.e - ib.s + 1 + 2 * Globals::nghost) + - 2 * Globals::nghost; 
+              parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) {
+                const Real off_diag = Ax[idx] - diag[idx] * xo[idx];
+                const Real val = rhs[idx] - off_diag;
+                xn[idx] = weight * val / diag[idx] + (1.0 - weight) * xo[idx];
+              });
+            }
+          });
    }

From 217f26ee1eaff9b0b2ed0d5d510b3eeac69ca7bd Mon Sep 17 00:00:00 2001
From: Luke Roberts
Date: Mon, 11 Mar 2024 14:16:15 -0600
Subject: [PATCH 26/39] small

---
 src/solvers/mg_solver.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp
index 5e4c1b1ed534..feb39f9f635a 100644
--- a/src/solvers/mg_solver.hpp
+++ b/src/solvers/mg_solver.hpp
@@ -238,7 +238,7 @@ class MGSolver {
            for (int c = 0; c < nvars; ++c) {
              Real *Ax = &pack(b, te, Axold_t(c), k, jb.s, ib.s);
              Real *diag = &pack(b, te, D_t(c), k, jb.s, ib.s);
-             Real *rhs = &pack(b, te, rhs_t(c), k, jb.s, ib.s);
+             Real *prhs = &pack(b, te, rhs_t(c), k, jb.s, ib.s);
              Real *xo = &pack(b, te, xold_t(c), k, jb.s, ib.s);
              Real *xn = &pack(b, te, xnew_t(c), k, jb.s, ib.s);
              const int npoints = (jb.e - jb.s + 1)
                                * (ib.e - ib.s + 1 + 2 * Globals::nghost)
                                - 2 * Globals::nghost;
              parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) {
                const Real off_diag = Ax[idx] - diag[idx] * xo[idx];
-               const Real val = rhs[idx] - off_diag;
+               const Real val = prhs[idx] - off_diag;
                xn[idx] = weight * val / diag[idx] + (1.0 - weight) * xo[idx];
              });

From ed2534aa4927c042ebfa27a2989be683db9b9624 Mon Sep 17 00:00:00 2001
From: Luke Roberts
Date: Mon, 11 Mar 2024 16:36:04 -0600
Subject: [PATCH 27/39] fix tests

---
 example/poisson_gmg/poisson_driver.cpp | 6 ++++--
 src/solvers/bicgstab_solver.hpp        | 5 +++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/example/poisson_gmg/poisson_driver.cpp b/example/poisson_gmg/poisson_driver.cpp
index 784653237413..42c43b7bde54 100644
--- a/example/poisson_gmg/poisson_driver.cpp
+++ b/example/poisson_gmg/poisson_driver.cpp
@@ -101,9 +101,11 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) {
     auto solve = zero_u;
     auto &itl = tl.AddIteration("Solver");
     if (solver == "BiCGSTAB") {
-      solve = bicgstab_solver->AddTasks(tl, itl, zero_u, i, pmesh, region, reg_dep_id);
+      auto setup = bicgstab_solver->AddSetupTasks(region, tl, zero_u, i, reg_dep_id, pmesh);
+      solve = bicgstab_solver->AddTasks(tl, itl, setup, i, pmesh, region, reg_dep_id);
     } else if (solver == "MG") {
-      solve = mg_solver->AddTasks(tl, itl, zero_u, i, pmesh, region, reg_dep_id);
+      auto setup = mg_solver->AddSetupTasks(region, tl, zero_u, i, reg_dep_id, pmesh);
+      solve = mg_solver->AddTasks(tl, itl, setup, i, pmesh, region, reg_dep_id);
     } else {
       PARTHENON_FAIL("Unknown solver type.");
     }
diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp
index 45ff23032e8c..aa261f5fbde4 100644
--- a/src/solvers/bicgstab_solver.hpp
+++ b/src/solvers/bicgstab_solver.hpp
@@ -72,7 +72,7 @@ class BiCGSTABSolver {
   BiCGSTABSolver(StateDescriptor *pkg, BiCGSTABParams params_in,
                  equations eq_in = equations(), std::vector<int> shape = {})
       : preconditioner(pkg, params_in.mg_params, eq_in, shape), params_(params_in),
-        iter_counter(0), eqs_(eq_in), presidual_tolerance(&params_in.residual_tolerance) {
+        iter_counter(0), eqs_(eq_in), presidual_tolerance(nullptr) {
     using namespace refinement_ops;
     auto m_no_ghost =
         Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}, shape);
@@ -260,6 +260,7 @@ class BiCGSTABSolver {
     // 14.
rhat0r_old <- rhat0r, zero all reductions region.AddRegionalDependencies(reg_dep_id, i, update_p | correct_x); + Real *ptol = presidual_tolerance == nullptr ? &(params_.residual_tolerance) : presidual_tolerance; auto check = itl.SetCompletionTask( update_p | correct_x, [](BiCGSTABSolver *solver, Mesh *pmesh, int partition, int max_iter, @@ -280,7 +281,7 @@ class BiCGSTABSolver { solver->residual.val = 0.0; return TaskStatus::iterate; }, - this, pmesh, i, params_.max_iters, presidual_tolerance); + this, pmesh, i, params_.max_iters, ptol); region.AddGlobalDependencies(reg_dep_id, i, check); reg_dep_id++; From a72e7b2e7d447cf01fa5bc35e01180c83c4474a0 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 11 Mar 2024 16:56:52 -0600 Subject: [PATCH 28/39] Merge develop --- CHANGELOG.md | 18 + CMakeLists.txt | 7 +- benchmarks/burgers/README.md | 7 +- benchmarks/burgers/burgers.pin | 5 + benchmarks/burgers/burgers_diff.py | 57 ++ benchmarks/burgers/burgers_package.cpp | 95 +- benchmarks/burgers/burgers_package.hpp | 12 +- benchmarks/burgers/parthenon_app_inputs.cpp | 2 +- doc/sphinx/src/README.rst | 16 +- doc/sphinx/src/instrumentation.rst | 29 + doc/sphinx/src/interface/state.rst | 19 +- doc/sphinx/src/mesh/mesh.rst | 5 + doc/sphinx/src/nested_par_for.rst | 153 ++- doc/sphinx/src/outputs.rst | 1 + doc/sphinx/src/parthenon_manager.rst | 13 +- doc/sphinx/src/particles.rst | 10 +- doc/sphinx/src/tasks.rst | 210 ++-- example/advection/advection_driver.hpp | 1 + example/advection/advection_package.cpp | 38 +- example/advection/custom_ascent_actions.yaml | 7 +- example/advection/main.cpp | 1 + example/advection/parthenon_app_inputs.cpp | 26 +- example/advection/parthinput.advection | 3 +- example/calculate_pi/calculate_pi.cpp | 2 +- example/calculate_pi/pi_driver.cpp | 4 +- example/calculate_pi/pi_driver.hpp | 2 +- example/kokkos_pi/kokkos_pi.cpp | 2 +- .../particle_leapfrog/particle_leapfrog.cpp | 14 +- example/particle_tracers/particle_tracers.cpp | 27 +- example/particles/particles.cpp | 146 ++- example/poisson/parthenon_app_inputs.cpp | 4 +- example/poisson/poisson_driver.cpp | 236 ++--- example/poisson/poisson_package.cpp | 32 +- example/poisson_gmg/parthinput.poisson | 8 +- example/poisson_gmg/poisson_driver.cpp | 8 +- .../sparse_advection/parthenon_app_inputs.cpp | 4 +- .../sparse_advection_package.cpp | 11 +- .../stochastic_subgrid_package.cpp | 29 +- .../process_timer.py | 164 ++++ src/CMakeLists.txt | 20 +- src/amr_criteria/refinement_package.cpp | 17 +- src/application_input.hpp | 8 +- src/basic_types.hpp | 13 +- src/bvals/boundary_conditions.cpp | 3 +- src/bvals/boundary_conditions_generic.hpp | 2 +- src/bvals/bvals_base.cpp | 5 +- src/bvals/comms/bnd_info.cpp | 10 +- src/bvals/comms/bnd_info.hpp | 2 + src/bvals/comms/boundary_communication.cpp | 45 +- src/bvals/comms/build_boundary_buffers.cpp | 3 +- src/bvals/comms/bvals_in_one.hpp | 8 +- src/bvals/comms/bvals_utils.hpp | 4 +- src/bvals/comms/flux_correction.cpp | 19 +- src/bvals/comms/tag_map.cpp | 14 +- src/coordinates/uniform_cartesian.hpp | 15 +- src/defs.hpp | 2 +- src/driver/driver.cpp | 96 +- src/driver/driver.hpp | 2 +- src/driver/multistage.hpp | 8 +- src/interface/mesh_data.cpp | 20 + src/interface/mesh_data.hpp | 13 +- src/interface/meshblock_data.cpp | 3 +- src/interface/metadata.cpp | 6 +- src/interface/metadata.hpp | 11 +- src/interface/params.cpp | 17 +- src/interface/params.hpp | 2 +- src/interface/sparse_pack.hpp | 79 +- src/interface/sparse_pool.cpp | 4 +- src/interface/swarm.cpp | 914 ++---------------- 
src/interface/swarm.hpp | 87 +- src/interface/swarm_comms.cpp | 740 ++++++++++++++ src/interface/swarm_container.cpp | 26 +- src/interface/swarm_device_context.hpp | 28 +- src/interface/update.cpp | 16 +- src/interface/update.hpp | 78 +- src/kokkos_abstraction.hpp | 26 +- src/mesh/amr_loadbalance.cpp | 442 +++++---- src/mesh/domain.hpp | 3 +- src/mesh/mesh-gmg.cpp | 24 + src/mesh/mesh.cpp | 59 +- src/mesh/mesh.hpp | 25 +- src/mesh/meshblock.cpp | 8 +- src/mesh/meshblock.hpp | 4 +- src/outputs/ascent.cpp | 2 +- src/outputs/output_utils.cpp | 67 +- src/outputs/output_utils.hpp | 58 +- src/outputs/outputs.cpp | 8 +- src/outputs/outputs.hpp | 23 +- src/outputs/parthenon_hdf5.cpp | 449 +++------ src/outputs/parthenon_hdf5.hpp | 170 +--- src/outputs/parthenon_hdf5_attributes.cpp | 141 +++ .../parthenon_hdf5_attributes_read.cpp | 61 ++ .../parthenon_hdf5_attributes_write.cpp | 61 ++ src/outputs/parthenon_hdf5_base.hpp | 106 ++ src/outputs/parthenon_hdf5_types.hpp | 170 ++++ src/outputs/restart.hpp | 2 +- src/parthenon/driver.hpp | 5 +- src/parthenon/package.hpp | 2 + src/parthenon/prelude.hpp | 1 + .../task_id.hpp => parthenon_arrays.cpp} | 43 +- src/parthenon_arrays.hpp | 18 +- src/parthenon_manager.cpp | 23 +- src/pgen/default_pgen.cpp | 21 +- src/prolong_restrict/pr_loops.hpp | 10 +- src/solvers/bicgstab_solver.hpp | 112 +-- src/solvers/mg_solver.hpp | 72 +- src/solvers/solver_utils.hpp | 77 +- src/tasks/task_id.cpp | 162 ---- src/tasks/task_list.hpp | 534 ---------- src/tasks/task_types.hpp | 102 -- src/tasks/tasks.hpp | 500 ++++++++++ src/tasks/thread_pool.hpp | 139 +++ src/utils/buffer_utils.cpp | 10 +- src/utils/index_split.cpp | 128 +++ src/utils/index_split.hpp | 118 +++ src/utils/instrument.hpp | 53 + .../advection_outflow/advection_outflow.py | 32 +- .../parthinput.advection_outflow | 2 +- tst/regression/utils/test_case.py | 7 +- tst/style/cpplint.py | 8 +- tst/unit/CMakeLists.txt | 3 +- tst/unit/test_index_split.cpp | 281 ++++++ tst/unit/test_metadata.cpp | 12 - tst/unit/test_sparse_pack.cpp | 87 +- tst/unit/test_state_descriptor.cpp | 12 +- tst/unit/test_swarm.cpp | 13 +- tst/unit/test_taskid.cpp | 36 +- tst/unit/test_tasklist.cpp | 2 +- 128 files changed, 4994 insertions(+), 3308 deletions(-) create mode 100644 benchmarks/burgers/burgers_diff.py create mode 100644 doc/sphinx/src/instrumentation.rst create mode 100644 scripts/python/packages/parthenon_process_kernel_timer/process_timer.py create mode 100644 src/interface/swarm_comms.cpp create mode 100644 src/outputs/parthenon_hdf5_attributes.cpp create mode 100644 src/outputs/parthenon_hdf5_attributes_read.cpp create mode 100644 src/outputs/parthenon_hdf5_attributes_write.cpp create mode 100644 src/outputs/parthenon_hdf5_base.hpp create mode 100644 src/outputs/parthenon_hdf5_types.hpp rename src/{tasks/task_id.hpp => parthenon_arrays.cpp} (51%) delete mode 100644 src/tasks/task_id.cpp delete mode 100644 src/tasks/task_list.hpp delete mode 100644 src/tasks/task_types.hpp create mode 100644 src/tasks/tasks.hpp create mode 100644 src/tasks/thread_pool.hpp create mode 100644 src/utils/index_split.cpp create mode 100644 src/utils/index_split.hpp create mode 100644 src/utils/instrument.hpp create mode 100644 tst/unit/test_index_split.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 844f68d39003..74cd391301e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,20 +3,38 @@ ## Current develop ### Added (new features/APIs/variables/...) 
+- [[PR 852]](https://github.com/parthenon-hpc-lab/parthenon/pull/852) Add Mesh version of UserWorkBeforeOutput +- [[PR 998]](https://github.com/parthenon-hpc-lab/parthenon/pull/998) tensor indices added to sparse pack +- [[PR 999]](https://github.com/parthenon-hpc-lab/parthenon/pull/999) Add a post-initialization hook +- [[PR 987]](https://github.com/parthenon-hpc-lab/parthenon/pull/987) New tasking infrastructure and capabilities +- [[PR 969]](https://github.com/parthenon-hpc-lab/parthenon/pull/969) New macro-based auto-naming of profiling regions and kernels +- [[PR 981]](https://github.com/parthenon-hpc-lab/parthenon/pull/981) Add IndexSplit +- [[PR 983]](https://github.com/parthenon-hpc-lab/parthenon/pull/983) Add Contains to SparsePack - [[PR 968]](https://github.com/parthenon-hpc-lab/parthenon/pull/968) Add per package registration of boundary conditions - [[PR 948]](https://github.com/parthenon-hpc-lab/parthenon/pull/948) Add solver interface and update Poisson geometric multi-grid example +- [[PR 996]](https://github.com/parthenon-hpc-lab/parthenon/pull/996) Remove dynamic allocations from swarm particle creation ### Changed (changing behavior/API/variables/...) ### Fixed (not changing behavior/API/variables/...) +- [[PR1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code +- [[PR992]](https://github.com/parthenon-hpc-lab/parthenon/pull/992) Allow custom PR ops with sparse pools +- [[PR988]](https://github.com/parthenon-hpc-lab/parthenon/pull/988) Fix bug in neighbor finding routine for small, periodic, refined meshes +- [[PR986]](https://github.com/parthenon-hpc-lab/parthenon/pull/986) Fix bug in sparse boundary communication BndInfo cacheing - [[PR978]](https://github.com/parthenon-hpc-lab/parthenon/pull/978) remove erroneous sparse check ### Infrastructure (changes irrelevant to downstream codes) +- [[PR 1017]](https://github.com/parthenon-hpc-lab/parthenon/pull/1017) Make regression tests more verbose on failure +- [[PR 1007]](https://github.com/parthenon-hpc-lab/parthenon/pull/1007) Split template instantiations for HDF5 Read/Write attributes to speed up compile times +- [[PR 990]](https://github.com/parthenon-hpc-lab/parthenon/pull/990) Partial refactor of HDF5 I/O code for readability/extendability +- [[PR 982]](https://github.com/parthenon-hpc-lab/parthenon/pull/982) add some gut check testing for parthenon-VIBE ### Removed (removing behavior/API/varaibles/...) ### Incompatibilities (i.e. 
breaking changes) +- [[PR 987]](https://github.com/parthenon-hpc-lab/parthenon/pull/987) Change the API for what was IterativeTasks - [[PR 974]](https://github.com/parthenon-hpc-lab/parthenon/pull/974) Change GetParentPointer to always return T* +- [[PR 996]](https://github.com/parthenon-hpc-lab/parthenon/pull/996) Remove dynamic allocations from swarm particle creation ## Release 23.11 diff --git a/CMakeLists.txt b/CMakeLists.txt index 994399d4bdc0..7d0d4ab3a496 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,9 +62,9 @@ include(cmake/Format.cmake) include(cmake/Lint.cmake) # regression test reference data -set(REGRESSION_GOLD_STANDARD_VER 20 CACHE STRING "Version of gold standard to download and use") +set(REGRESSION_GOLD_STANDARD_VER 21 CACHE STRING "Version of gold standard to download and use") set(REGRESSION_GOLD_STANDARD_HASH - "SHA512=e5e421f3c0be01e4708965542bb8b1b79b5c96de97091e46972e375c7616588d026a9a8e29226d9c7ef75346bc859fd9af72acdc7e95e0d783b5ef29aa4630b1" + "SHA512=e16b14272915b4607965e5900961402f6da96dc13da8ea3c3d213d61f82d3a1dded08c40a9ab644aa3409d93a045bba360a90a43dc289b24f525878f9ba50890" CACHE STRING "Hash of default gold standard file to download") option(REGRESSION_GOLD_STANDARD_SYNC "Automatically sync gold standard files." ON) @@ -116,6 +116,9 @@ endif() list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") find_package(Filesystem REQUIRED COMPONENTS Experimental Final) +# Require threading for tasks +find_package(Threads) + set(ENABLE_MPI OFF) set(NUM_MPI_PROC_TESTING "4" CACHE STRING "Number of mpi processors to use when running tests with MPI") if (NOT PARTHENON_DISABLE_MPI) diff --git a/benchmarks/burgers/README.md b/benchmarks/burgers/README.md index 9b4acfb85243..fe84fc5b0924 100644 --- a/benchmarks/burgers/README.md +++ b/benchmarks/burgers/README.md @@ -74,6 +74,12 @@ To build for execution on a single GPU, it should be sufficient to add the follo ``` where `Kokkos_ARCH` should be set appropriately for the machine (see [here](https://kokkos.github.io/kokkos-core-wiki/keywords.html)). +### Diagnostics + +Parthenon-VIBE prints to a history file (default name `burgers.hst`) a time series of the sum of squares of evolved variables integrated over volume for each octant of the domain, as well as the total number of meshblocks in the simulation at that time. To compare these quantities between runs, we provide the `burgers_diff.py` program in the benchmark folder. This will diff two history files and report when the relative difference is greater than some tolerance. + +Note that `burgers.hst` is **appended** to when the executable is re-run. So if you want to compare two different history files, rename the history file by changing either `problem_id` in the `parthenon/job` block in the input deck (this can be done on the command line. When you start the program, add `parthenon/job/problem_id=mynewname` to the command line argument), or copy the old file to back it up. + ### Memory Usage The dominant memory usage in Parthenon-VIBE is for storage of the solution, for which two copies are required to support second order time stepping, for storing the update for a integrator stage (essentially the flux divergence), the intercell fluxes of each variable, for intermediate values of each solution variable on each side of every face, and for a derived quantity that we compute from the evolved solution. 
From this we can construct a simple model for the memory usage $M$ as @@ -110,4 +116,3 @@ For the GPU, we measure throughput on a single-level mesh ("parthenon/mesh/numle

Plot showing throughput on an A100 at different mesh and block sizes
Figure 3: Throughput for different mesh and block sizes on a single 40 GB A100 GPU.

-
diff --git a/benchmarks/burgers/burgers.pin b/benchmarks/burgers/burgers.pin
index cac4565b74ad..257dcf6cec72 100644
--- a/benchmarks/burgers/burgers.pin
+++ b/benchmarks/burgers/burgers.pin
@@ -67,6 +67,11 @@ file_type = hdf5
 dt = -0.4
 variables = U, derived
 
+<parthenon/output1>
+file_type = hst
+data_format = %.14e
+dt = 0.01
+
 <burgers>
 cfl = 0.8
 recon = weno5
diff --git a/benchmarks/burgers/burgers_diff.py b/benchmarks/burgers/burgers_diff.py
new file mode 100644
index 000000000000..d91323d9baae
--- /dev/null
+++ b/benchmarks/burgers/burgers_diff.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+# ========================================================================================
+# (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+#
+# This program was produced under U.S. Government contract 89233218CNA000001 for Los
+# Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+# for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+# in the program are reserved by Triad National Security, LLC, and the U.S. Department
+# of Energy/National Nuclear Security Administration. The Government is granted for
+# itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+# license in this material to reproduce, prepare derivative works, distribute copies to
+# the public, perform publicly and display publicly, and to permit others to do so.
+# ========================================================================================
+
+import sys
+import numpy as np
+from argparse import ArgumentParser
+
+parser = ArgumentParser(
+    prog="burgers_diff.py",
+    description="Compute difference between two history files produced by parthenon VIBE",
+)
+parser.add_argument("file1", type=str, help="First file in diff")
+parser.add_argument("file2", type=str, help="Second file in diff")
+parser.add_argument(
+    "-t", "--tolerance", type=float, default=1e-8, help="Relative tolerance for diff"
+)
+
+
+def get_rel_diff(d1, d2):
+    "Get relative difference between two numpy arrays"
+    return 2 * np.abs(d1 - d2) / (d1 + d2 + 1e-20)
+
+
+def compare_files(file1, file2, tolerance, print_results=True):
+    "Compare file1 and file2 to tolerance. Optionally print results."
+    d1 = np.loadtxt(file1)
+    d2 = np.loadtxt(file2)
+    diffs = get_rel_diff(d1, d2)
+    mask = diffs > tolerance
+    errcode = 0
+    if np.any(mask):
+        errcode = 1
+        if print_results:
+            print("Diffs found!")
+            indices = np.transpose(np.nonzero(mask))
+            print("Diff locations (row, column) =", indices)
+            print("Diffs =", diffs[mask])
+    else:
+        if print_results:
+            print("No diffs found!")
+    return errcode
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    sys.exit(compare_files(args.file1, args.file2, args.tolerance, True))
diff --git a/benchmarks/burgers/burgers_package.cpp b/benchmarks/burgers/burgers_package.cpp
index 64201440b80b..55ac7b89279d 100644
--- a/benchmarks/burgers/burgers_package.cpp
+++ b/benchmarks/burgers/burgers_package.cpp
@@ -1,5 +1,5 @@
 //========================================================================================
-// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S.
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -93,6 +93,48 @@ std::shared_ptr Initialize(ParameterInput *pin) { m = Metadata({Metadata::Cell, Metadata::Derived, Metadata::OneCopy}); pkg->AddField("derived", m); + // Compute the octants + std::vector octants; + std::vector mesh_mins, mesh_maxs, mesh_mids; + Real mesh_vol = 1; + for (int d = 1; d <= 3; ++d) { + mesh_mins.push_back(pin->GetReal("parthenon/mesh", "x" + std::to_string(d) + "min")); + mesh_maxs.push_back(pin->GetReal("parthenon/mesh", "x" + std::to_string(d) + "max")); + mesh_vol *= (mesh_maxs.back() - mesh_mins.back()); + mesh_mids.push_back(0.5 * (mesh_mins.back() + mesh_maxs.back())); + } + pkg->AddParam("mesh_volume", mesh_vol); + for (int side1 = 0; side1 < 2; ++side1) { + Region r; + r.xmin[0] = side1 ? mesh_mids[0] : mesh_mins[0]; + r.xmax[0] = side1 ? mesh_maxs[0] : mesh_mids[0]; + for (int side2 = 0; side2 < 2; ++side2) { + r.xmin[1] = side2 ? mesh_mids[1] : mesh_mins[1]; + r.xmax[1] = side2 ? mesh_maxs[1] : mesh_mids[1]; + for (int side3 = 0; side3 < 2; ++side3) { + r.xmin[2] = side3 ? mesh_mids[2] : mesh_mins[2]; + r.xmax[2] = side3 ? mesh_maxs[2] : mesh_mids[2]; + octants.push_back(r); + } + } + } + + // Histories + auto HstSum = parthenon::UserHistoryOperation::sum; + using parthenon::HistoryOutputVar; + parthenon::HstVar_list hst_vars = {}; + int i_octant = 0; + for (auto &octant : octants) { + auto ReduceMass = [=](MeshData *md) { + return MassHistory(md, octant.xmin[0], octant.xmax[0], octant.xmin[1], + octant.xmax[1], octant.xmin[2], octant.xmax[2]); + }; + hst_vars.emplace_back(HstSum, ReduceMass, "MS Mass " + std::to_string(i_octant)); + i_octant++; + } + hst_vars.emplace_back(HstSum, MeshCountHistory, "Meshblock count"); + pkg->AddParam(parthenon::hist_param_key, hst_vars); + pkg->EstimateTimestepMesh = EstimateTimestepMesh; pkg->FillDerivedMesh = CalculateDerived; @@ -110,7 +152,7 @@ void CalculateDerived(MeshData *md) { size_t scratch_size = 0; constexpr int scratch_level = 0; parthenon::par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "CalculateDerived", DevExecSpace(), scratch_size, + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size, scratch_level, 0, nblocks - 1, kb.s, kb.e, jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k, const int j) { Real *out = &v(b, 0, k, j, 0); @@ -127,8 +169,8 @@ void CalculateDerived(MeshData *md) { // provide the routine that estimates a stable timestep for this package Real EstimateTimestepMesh(MeshData *md) { - Kokkos::Profiling::pushRegion("Task_burgers_EstimateTimestepMesh"); - auto pm = md->GetParentPointer(); + PARTHENON_INSTRUMENT + Mesh *pm = md->GetMeshPointer(); IndexRange ib = md->GetBoundsI(IndexDomain::interior); IndexRange jb = md->GetBoundsJ(IndexDomain::interior); IndexRange kb = md->GetBoundsK(IndexDomain::interior); @@ -155,14 +197,13 @@ Real EstimateTimestepMesh(MeshData *md) { }, Kokkos::Min(min_dt)); - Kokkos::Profiling::popRegion(); // Task_burgers_EstimateTimestepMesh return cfl * min_dt; } TaskStatus CalculateFluxes(MeshData *md) { using parthenon::ScratchPad1D; using parthenon::team_mbr_t; - Kokkos::Profiling::pushRegion("Task_burgers_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pm = md->GetParentPointer(); const int ndim = pm->ndim; @@ -194,7 +235,7 @@ TaskStatus CalculateFluxes(MeshData *md) { size_t scratch_size = 0; constexpr int scratch_level = 0; parthenon::par_for_outer( - 
DEFAULT_OUTER_LOOP_PATTERN, "burgers::reconstruction", DevExecSpace(), scratch_size,
+      DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), scratch_size,
       scratch_level, 0, nblocks - 1, kb.s, kb.e + dk, jb.s, jb.e + dj,
       KOKKOS_LAMBDA(team_mbr_t member, const int b, const int k, const int j) {
         bool xflux = (k <= kb.e && j <= jb.e);
@@ -360,8 +401,44 @@ TaskStatus CalculateFluxes(MeshData<Real> *md) {
         }
       });
 
-  Kokkos::Profiling::popRegion(); // Task_burgers_CalculateFluxes
   return TaskStatus::complete;
 }
 
+Real MassHistory(MeshData<Real> *md, const Real x1min, const Real x1max, const Real x2min,
+                 const Real x2max, const Real x3min, const Real x3max) {
+  const auto ib = md->GetBoundsI(IndexDomain::interior);
+  const auto jb = md->GetBoundsJ(IndexDomain::interior);
+  const auto kb = md->GetBoundsK(IndexDomain::interior);
+
+  Mesh *pm = md->GetMeshPointer();
+  auto &params = pm->packages.Get("burgers_package")->AllParams();
+  const auto &mesh_vol = params.Get<Real>("mesh_volume");
+
+  std::vector<std::string> vars = {"U"};
+  const auto pack = md->PackVariables(vars);
+
+  Real result = 0.0;
+  parthenon::par_reduce(
+      parthenon::LoopPatternMDRange(), "MassHistory", DevExecSpace(), 0,
+      pack.GetDim(5) - 1, 0, pack.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      KOKKOS_LAMBDA(const int b, const int v, const int k, const int j, const int i,
+                    Real &lresult) {
+        const auto &coords = pack.GetCoords(b);
+        const Real vol = coords.CellVolume(k, j, i);
+        const Real weight = vol / (mesh_vol + 1e-20);
+        const Real x1 = coords.Xc<1>(k, j, i);
+        const Real x2 = coords.Xc<2>(k, j, i);
+        const Real x3 = coords.Xc<3>(k, j, i);
+        // Inclusive bounds are appropriate here because cell-centered
+        // coordinates are passed in, not edges.
+        const Real mask = (x1min <= x1) && (x1 <= x1max) && (x2min <= x2) &&
+                          (x2 <= x2max) && (x3min <= x3) && (x3 <= x3max);
+        lresult += mask * pack(b, v, k, j, i) * pack(b, v, k, j, i) * weight;
+      },
+      Kokkos::Sum<Real>(result));
+  return result;
+}
+
+Real MeshCountHistory(MeshData<Real> *md) { return md->NumBlocks(); }
+
 } // namespace burgers_package
diff --git a/benchmarks/burgers/burgers_package.hpp b/benchmarks/burgers/burgers_package.hpp
index b4933cf4fec6..2585d1c59ba3 100644
--- a/benchmarks/burgers/burgers_package.hpp
+++ b/benchmarks/burgers/burgers_package.hpp
@@ -1,5 +1,5 @@
 //========================================================================================
-// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S.
Government contract 89233218CNA000001 for Los
 // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -25,6 +25,9 @@ std::shared_ptr<StateDescriptor> Initialize(ParameterInput *pin);
 void CalculateDerived(MeshData<Real> *md);
 Real EstimateTimestepMesh(MeshData<Real> *md);
 TaskStatus CalculateFluxes(MeshData<Real> *md);
+Real MassHistory(MeshData<Real> *md, const Real x1min, const Real x1max, const Real x2min,
+                 const Real x2max, const Real x3min, const Real x3max);
+Real MeshCountHistory(MeshData<Real> *md);
 
 // compute the hll flux for Burgers' equation
 KOKKOS_INLINE_FUNCTION
@@ -40,6 +43,13 @@ void lr_to_flux(const Real uxl, const Real uxr, const Real uyl, const Real uyr,
   fuz = 0.5 * (sr * uzl * upl - sl * uzr * upr + sl * sr * (uzr - uzl)) * islsr;
 }
 
+// JMM: I could have instead used the parthenon::RegionSize
+// class. However, this little Region struct is lighter weight and
+// easier to work with in this context.
+struct Region {
+  std::array<Real, 3> xmin, xmax;
+};
+
 } // namespace burgers_package
 
 #endif // BENCHMARKS_BURGERS_BURGERS_PACKAGE_HPP_
diff --git a/benchmarks/burgers/parthenon_app_inputs.cpp b/benchmarks/burgers/parthenon_app_inputs.cpp
index 94d24b8e2068..c831d4ef7802 100644
--- a/benchmarks/burgers/parthenon_app_inputs.cpp
+++ b/benchmarks/burgers/parthenon_app_inputs.cpp
@@ -53,7 +53,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) {
   const auto num_vars = q.GetDim(4);
 
   pmb->par_for(
-      "Burgers::ProblemGenerator", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+      PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
       KOKKOS_LAMBDA(const int k, const int j, const int i) {
         const Real x = coords.Xc<1>(i);
         const Real y = coords.Xc<2>(j);
diff --git a/doc/sphinx/src/README.rst b/doc/sphinx/src/README.rst
index 521cabf3bc03..5161f21c68e9 100644
--- a/doc/sphinx/src/README.rst
+++ b/doc/sphinx/src/README.rst
@@ -178,9 +178,12 @@ Mesh
 ^^^^
 
 - ``InitUserMeshData``
+- ``ProblemGenerator``
+- ``PostInitialization``
 - ``PreStepUserWorkInLoop``
 - ``PostStepUserWorkInLoop``
 - ``UserWorkAfterLoop``
+- ``UserMeshWorkBeforeOutput``
 
 MeshBlock
 ^^^^^^^^^
@@ -188,6 +191,7 @@ MeshBlock
 - ``InitApplicationMeshBlockData``
 - ``InitMeshBlockUserData``
 - ``ProblemGenerator``
+- ``PostInitialization``
 - ``UserWorkBeforeOutput``
 
 To redefine these functions, the user sets the respective function
@@ -195,12 +199,12 @@ pointers in the ApplicationInput member app_input of the ParthenonManager
 class prior to calling ``ParthenonInit``. This is demonstrated in the
 ``main()`` functions in the examples.
 
-Note that the ``ProblemGenerator``\ s of ``Mesh`` and ``MeshBlock`` are
-mutually exclusive. Moreover, the ``Mesh`` one requires
-``parthenon/mesh/pack_size=-1`` during initialization, i.e., all blocks
-on a rank need to be in a single pack. This allows to use MPI reductions
-inside the function, for example, to globally normalize quantities. The
-``parthenon/mesh/pack_size=-1`` exists only during problem
+Note that the ``ProblemGenerator``\ s (and ``PostInitialization``\ s) of
+``Mesh`` and ``MeshBlock`` are mutually exclusive. Moreover, the ``Mesh``
+ones require ``parthenon/mesh/pack_size=-1`` during initialization, i.e.,
+all blocks on a rank need to be in a single pack. This allows one to use
+MPI reductions inside the function, for example, to globally normalize
+quantities. The ``parthenon/mesh/pack_size=-1`` exists only during problem
 initialization, i.e., simulations can be restarted with an arbitrary
 ``pack_size``. For an example of the ``Mesh`` version, see the
 `Poisson example `__.
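+
+As a minimal sketch of the ``Mesh`` version (illustrative only, and not
+taken from that example): the field name ``phi``, the kernel label, and the
+normalization target below are hypothetical, but the pattern of filling data
+and then finishing a global sum with a single MPI reduction over the one
+pack per rank is the intended use of ``pack_size=-1``:
+
+.. code:: cpp
+
+   void ProblemGenerator(Mesh *pm, ParameterInput *pin, MeshData<Real> *md) {
+     // With parthenon/mesh/pack_size=-1 this pack spans every block on the rank.
+     auto pack = md->PackVariables(std::vector<std::string>{"phi"});
+     IndexRange ib = md->GetBoundsI(IndexDomain::interior);
+     IndexRange jb = md->GetBoundsJ(IndexDomain::interior);
+     IndexRange kb = md->GetBoundsK(IndexDomain::interior);
+
+     // Fill phi and accumulate a rank-local sum over all blocks.
+     Real local_sum = 0.0;
+     parthenon::par_reduce(
+         parthenon::loop_pattern_mdrange_tag, "InitPhi", DevExecSpace(), 0,
+         pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+         KOKKOS_LAMBDA(const int b, const int k, const int j, const int i,
+                       Real &lsum) {
+           pack(b, 0, k, j, i) = 1.0; // hypothetical initial data
+           lsum += pack(b, 0, k, j, i);
+         },
+         Kokkos::Sum<Real>(local_sum));
+
+     // One MPI reduction completes the global sum used to normalize phi.
+     Real global_sum = local_sum;
+   #ifdef MPI_PARALLEL
+     MPI_Allreduce(&local_sum, &global_sum, 1, MPI_PARTHENON_REAL, MPI_SUM,
+                   MPI_COMM_WORLD);
+   #endif
+     // ... rescale phi by 1.0 / global_sum in a second kernel ...
+   }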
diff --git a/doc/sphinx/src/instrumentation.rst b/doc/sphinx/src/instrumentation.rst
new file mode 100644
index 000000000000..14f8c643cf74
--- /dev/null
+++ b/doc/sphinx/src/instrumentation.rst
@@ -0,0 +1,29 @@
+.. _instrumentation:
+
+Performance Instrumentation
+===========================
+
+Parthenon provides several macros that make instrumenting your code simple. For now,
+these macros instantiate Kokkos profiling regions via calls to
+``Kokkos::Profiling::pushRegion`` and ``Kokkos::Profiling::popRegion``, meaning all the
+Kokkos profiling tools should work straightforwardly with Parthenon-based applications.
+
+- ``PARTHENON_INSTRUMENT``: Instantiates an object that pushes a profiling region on
+  construction and pops the region on destruction. The name of the region is
+  auto-generated and takes the form ``"file_name::line_number::function_name"``. The region
+  being profiled is controlled by invoking the macro at the appropriate scope.
+- ``PARTHENON_INSTRUMENT_REGION(name)``: Same as ``PARTHENON_INSTRUMENT``, but uses the
+  provided name instead of the auto-generated name.
+- ``PARTHENON_INSTRUMENT_REGION_PUSH``: A trivial wrapper around ``pushRegion`` where
+  the name is auto-generated as above.
+- ``PARTHENON_INSTRUMENT_REGION_POP``: A trivial wrapper around ``popRegion``.
+
+In addition to these macros, Parthenon provides the ``PARTHENON_AUTO_LABEL`` macro which
+can be used to provide a label to kernels (e.g. through the various ``par_for``
+functions). The auto-generated name is the same as was described above.
+
+Though not required, the use of the auto-generated names is highly recommended. In
+addition to avoiding possible name collisions, the auto-generated names provide a simple
+structure that is amenable to post-processing profiling results to ease analysis. For
+example, the ``process_timer.py`` script that ships with Parthenon post-processes the
+results of the Kokkos simple kernel timer output to provide a convenient view of the data.
\ No newline at end of file
diff --git a/doc/sphinx/src/interface/state.rst b/doc/sphinx/src/interface/state.rst
index e0760d4b8ff1..6a4cfaf840e1 100644
--- a/doc/sphinx/src/interface/state.rst
+++ b/doc/sphinx/src/interface/state.rst
@@ -113,12 +113,19 @@ several useful features and functions.
   if set (defaults to ``nullptr`` and therefore a no-op) to print
   diagnostics after the time-integration advance
 - ``void UserWorkBeforeLoopMesh(Mesh *, ParameterInput *pin, SimTime
-  &tm)`` performs a per-package, mesh-wide calculation after the mesh
-  has been generated, and problem generators called, but before any
-  time evolution. This work is done both on first initialization and
-  on restart. If you would like to avoid doing the work upon restart,
-  you can check for the const ``is_restart`` member field of the ``Mesh``
-  object.
+  &tm)`` performs a per-package, mesh-wide calculation after (1) the mesh
+  has been generated, (2) problem generators are called, and (3) comms
+  are executed, but before any time evolution. This work is done both on
+  first initialization and on restart. If you would like to avoid doing the
+  work upon restart, you can check for the const ``is_restart`` member
+  field of the ``Mesh`` object. It is worth making a clear distinction
+  between ``UserWorkBeforeLoopMesh`` and the ``PostInitialization`` hook
+  on ``ApplicationInput``. ``PostInitialization`` is tied closely to
+  initialization, and will not be called upon restarts.
``PostInitialization`` + is also carefully positioned after ``ProblemGenerator`` and before + ``PreCommFillDerived`` (and hence communications). In practice, when + additional granularity is required inbetween initialization and communication, + ``PostInitialization`` may be the desired hook. The reasoning for providing ``FillDerived*`` and ``EstimateTimestep*`` function pointers appropriate for usage with both ``MeshData`` and diff --git a/doc/sphinx/src/mesh/mesh.rst b/doc/sphinx/src/mesh/mesh.rst index aa891633cdde..88ffa29af2ed 100644 --- a/doc/sphinx/src/mesh/mesh.rst +++ b/doc/sphinx/src/mesh/mesh.rst @@ -50,6 +50,11 @@ member. time-integration advance. The default behavior calls to each package's (StateDesrcriptor's) ``PreStepDiagnostics`` method which, in turn, delegates to a ``std::function`` member that defaults to a no-op. +- ``UserMeshWorkBeforeOutput(Mesh*, ParameterInput*, SimTime const&)`` + is called to perform mesh-wide work immediately before writing an output + (the default is a no-op). The most likely use case is to fill derived + fields with updated values before writing them out to disk (or passing + them to Ascent for in-situ analysis). Multi-grid Grids Stored in ``Mesh`` ----------------------------------- diff --git a/doc/sphinx/src/nested_par_for.rst b/doc/sphinx/src/nested_par_for.rst index de33ca7d6195..308acf2c4c89 100644 --- a/doc/sphinx/src/nested_par_for.rst +++ b/doc/sphinx/src/nested_par_for.rst @@ -51,7 +51,7 @@ Data type for memory in scratch pad/cache memory. Use documentation `__ for determining scratch pad memory needs before kernel launch. -Important usage hints +On Barriers --------------------- In order to ensure that individual threads of a team are synchronized @@ -70,6 +70,7 @@ write to common variables, see this `code `__ for an example. + Cmake Options ------------- @@ -86,3 +87,153 @@ GPUs. ``#pragma omp simd`` to vectorize the loop, which typically gives better vectorization loops than ``PAR_LOOP_INNER_LAYOUT=TVR_INNER_LOOP`` on CPUs and so is the default on CPUs. + + +Performance Considerations +--------------------------- + +Hierarchical parallelism can produce very performant code, but a +deeper awareness of how hardware is mapped to threads is required to +get optimal performance. Here we list a few strategies/considerations. + +* On CPU, with `SIMDFOR_INNER_LOOP` you may have trouble vectorizing + unless you help the compiler along. One way to do so is to work with + raw pointers to contiguous memory, rather than working with views + and strides. Even for stencil ops, if you can pull out pointers that + represent the different points on the stencil, this can help with + vectorization. +* Similarly on CPUs, due to the cost of starting up a vector op, + vectorization will only be a performance win if there's enough work + in the inner loop. A minimum of 16 points is required for the op to + vectorize at all. Experience shows, however, that at least 64 is + really required to see big wins. One strategy for providing enough + vector cells in the inner loop is to do a 1D ``SIMDFOR`` inner loop + but combine the ``j`` and ``i`` indices by simply looping over the + contiguous memory in a rasterized plane on a block. +* On GPUs, the outer loop typically maps to blocks, while the inner + maps to threads. 
To see good performance, you must both provide + enough work in the inner loop to create enough threads to fill in + CUDA terms a streaming multiprocessor (SM, equivalent to a Compute + Unit or CU on AMD GPUs) with multiple warps (or wavefronts for AMD) + to take advantage of pipelining and enough work in the outer loop to + create enough blocks to fill all SMs on the GPU divided by the + number of simultaneous streams. The number of warps in flight on the + inner loop per SM (which is related to "occupancy") will depend + positively on length of the inner loop and negatively on higher + shared memory usage (scratch pad memory in Kokkos parlance and Local + Data Share or LDS on AMD GPUs) and higher register usage. Note that + the number of SMs and the available shared memory and registers per + SM will vary between GPU architectures and especially between GPU + vendors. + +IndexSplit +------------- + +To balance the CPU vs GPU hardware considerations of hierarchical +parallelism, ``Parthenon`` provides a utility, the ``IndexSplit`` +class, defined in the ``utils/index_split.hpp`` header file and +available in ```` in the +``parthenon::package::prelude`` namespace. + +In our experience ``IndexSplit`` is most beneficial when working with +small meshblocks on CPUs, especially in two dimensions. For small +blocks, we want vectorized operations over contiguous memory for our +innermost loop, but we want that loop to contain enough work for, +e.g., vector ops to function. We have often found that the optimal +split is to fuse j, and i into the inner loop and use k and blocks in +the outer loop. + +The ``IndexSplit`` class can be constructed as + +.. code:: cpp + + IndexSplit(MeshData md, IndexDomain domain, const int nkp, const int njp); + +where here ``md`` is a ``MeshData`` object on which you want to +operate. ``domain`` specifies where in the ``MeshBlock`` you wish to +operate, for example ``IndexDomain::Interior``. ``nkp`` and ``njp`` +are the number of points in ``X3`` and ``X2`` respectively that are in +the outer loop. All remaining points are in the inner loop; each team +will iterate over multiple `k` and/or `j` indices to cover the +specified `k/j` range. Typically ``MeshBlock`` index in the pack is +also assumed to be in the outer loop. ``nkp`` and ``njp`` also accept +special flags ``IndexSplit::all_outer`` and ``IndexSplit::no_outer``, +which specify that all and none of the indices in that direction +should be in the outer loop. + +.. warning:: + + Note that, in contrast to ``njp``, ``nkp`` points in the + ``k``-direction are not included in the innermost loop bounds. You + must loop over ``k`` by hand inside the outer loop body. + +A second constructor alternatively sets the range for ``X3``, ``X2``, +and ``X1`` explicitly: + +.. code:: cpp + + IndexSplit(MeshData *md, const IndexRange &kb, const IndexRange &jb, + const IndexRange &ib, const int nkp, const int njp); + +where here ``kb``, ``jb``, and ``ib`` specify the starting and ending +indices for ``X3``, ``X2``, and ``X1`` respecively. + +.. warning:: + + Note that, at this time, ``IndexSplit`` doesn't know about + face-centered or edge-centered data. To use ``IndexSplit`` with, + e.g., face-centered data, set the input ``IndexRange`` quantities to + match the shape for the face-centered data (e.g., with the + appropriate offsets). + +An ``IndexSplit`` object is typically used as: + +.. 
code:: cpp + + using namespace parthenon::package::prelude; + using parthenon::ScratchPad1D; + using parthenon::IndexSplit; + using parthenon::par_for_outer; + using parthenon::par_for_inner; + using parthenon::team_mbr_t; + // Initialize index split object + IndexSplit idx_sp(md, IndexDomain::interior, nkp, njp); + + // Request maximum size in i and j in the inner loop, for scratch + const int Ni = idx_sp.get_max_ni(); + const int Nj = idx_sp = get_max_nj(); + const in tNmax = Ni * Nj; + + // single scratch array for i,j + auto scratch_size = ScratchPad1D::shmem_size(Nmax); + constexpr int scratch_level = 0; + + // Par for + par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "KernalOuter", DevExecSpace(), scratch_size, + scratch_level, 0, nblocks - 1, 0, idx_sp.outer_size() - 1, + KOKKOS_LAMBDA(team_mbr_t member, const int b, const int outer_idx) { + ScratchPad1D scratch(member.team_scratch(scratch_level), Nmax); + // Get index ranges. Note they depend on where we are in the outer index! + // These give us a sense for where we are in k,j space + const auto krange = idx_sp.GetBoundsK(outer_idx); + const auto jrange = idx_sp.GetBoundsJ(outer_idx); + // This is the loop of contiguous inner memory. May contain i and j! + const auto flattened_inner_ijrange = idx_sp.GetInnerBounds(jrange); + const int inner_size = flattened_inner_ijrange.e - flattened_inner_ijrange.s + 1; + + // Whatever part of k is not in the outer loop can be looped over + // with a normal for loop here + for (int k = krange.s; k <= krange.e; ++k) { + + // pull out a pointer some variable in some pack. Note + // we pick the 0th index of i at k and jrange.s + Real *var = &pack(b, ivar, k, jrange.s, flattened_inner_ijrange.s); + + // Do something with the pointer in the inner loop. + par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, flattened_inner_size, + [&](const int i) { + foo(var[i]); + }); + } + }); diff --git a/doc/sphinx/src/outputs.rst b/doc/sphinx/src/outputs.rst index c01c957fdcf5..170c83ebac7e 100644 --- a/doc/sphinx/src/outputs.rst +++ b/doc/sphinx/src/outputs.rst @@ -46,6 +46,7 @@ look like file_type = hdf5 + write_xdmf = true # Determines whether xdmf annotations are output # nonexistent variables/swarms are ignored variables = density, velocity, & # comments are still ok energy # notice the & continuation character diff --git a/doc/sphinx/src/parthenon_manager.rst b/doc/sphinx/src/parthenon_manager.rst index 15211e8bccdb..e0bac60476d3 100644 --- a/doc/sphinx/src/parthenon_manager.rst +++ b/doc/sphinx/src/parthenon_manager.rst @@ -31,12 +31,16 @@ runtimes. The function Calls the ``Initialize(ParameterInput *pin)`` function of all packages to be utilized and creates the grid hierarchy, including the ``Mesh`` -and ``MeshBlock`` objects, and calls the ``ProblemGenerator`` -initialization routines. +and ``MeshBlock`` objects, and calls the ``ProblemGenerator`` (and +``PostInitialization``) routines. The reason these functions are split out is to enable decisions to be made by the application between reading the input deck and setting up -the grid. For example, a common use-case is: +the grid. For example, during problem initialization, ``ProblemGenerator`` +may be used to be the user-facing API to describe initial conditions, +whereas, ``PostInitialization`` could use those user-specified fields +to sync *all* fields prior to entering communication routines. A common +use-case is: .. code:: cpp @@ -53,13 +57,14 @@ the grid. 
   if (manager_status == ParthenonStatus::error) {
     pman.ParthenonFinalize();
     return 1;
-  }
+  }
 
   // Redefine parthenon defaults
   pman.app_input->ProcessPackages = MyProcessPackages;
   std::string prob = pman.pin->GetString("app", "problem");
   if (prob == "problem1") {
     pman.app_input->ProblemGenerator = Problem1Generator;
+    pman.app_input->PostInitialization = Problem1PostInitialization;
   } else {
     pman.app_input->ProblemGenerator = Problem2Generator;
   }
diff --git a/doc/sphinx/src/particles.rst b/doc/sphinx/src/particles.rst
index e86fae13ee98..9f78077c3f94 100644
--- a/doc/sphinx/src/particles.rst
+++ b/doc/sphinx/src/particles.rst
@@ -42,13 +42,13 @@ To add particles to a ``Swarm``, one calls
 
 .. code:: cpp
 
-  ParArray1D new_particles_mask = swarm->AddEmptyParticles(num_to_add, new_indices)
+  NewParticlesContext context = swarm->AddEmptyParticles(num_to_add);
 
 This call automatically resizes the memory pools as necessary and
-returns a ``ParArray1D`` mask indicating which indices in the
-``ParticleVariable``\ s are newly available. ``new_indices`` is a
-reference to a ``ParArrayND`` of size ``num_to_add`` which contains
-the indices of each newly added particle.
+returns a ``NewParticlesContext`` object that provides the methods
+``int GetNewParticlesMaxIndex()``, which returns the maximum index of the
+contiguous block of new swarm indices, and ``int GetNewParticleIndex(const int n)``,
+which converts the ``n``th new-particle index into its swarm index.
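+
+As a minimal sketch (assuming a swarm with a ``Real`` variable ``"x"``),
+one might initialize only the newly created particles as follows:
+
+.. code:: cpp
+
+  auto context = swarm->AddEmptyParticles(num_to_add);
+  auto &x = swarm->Get<Real>("x").Get();
+  pmb->par_for(
+      PARTHENON_AUTO_LABEL, 0, context.GetNewParticlesMaxIndex(),
+      KOKKOS_LAMBDA(const int new_n) {
+        // Map the new-particle index onto the swarm index before writing
+        const int n = context.GetNewParticleIndex(new_n);
+        x(n) = 0.0;
+      });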
 
 To remove particles from a ``Swarm``, one first calls
diff --git a/doc/sphinx/src/tasks.rst b/doc/sphinx/src/tasks.rst
index d4c0b361b7f9..1ee08dda2dac 100644
--- a/doc/sphinx/src/tasks.rst
+++ b/doc/sphinx/src/tasks.rst
@@ -3,85 +3,84 @@
 Tasks
 =====
 
+Parthenon's tasking infrastructure is how downstream applications describe
+and execute their work. Tasks are organized into a hierarchy of objects.
+``TaskCollection``s have one or more ``TaskRegion``s, ``TaskRegion``s have
+one or more ``TaskList``s, and ``TaskList``s can have one or more sublists
+(that are themselves ``TaskList``s).
+
+Task
+----
+
+Though downstream codes never have to interact with the ``Task`` object directly,
+it's useful to describe it nonetheless. A ``Task`` object is essentially a functor
+that stores the necessary data to invoke a downstream code's functions with
+the desired arguments. Importantly, however, it also stores information that
+relates it to other tasks, namely the tasks that must be complete before
+it should execute and the tasks that may be available to run after it completes.
+In other words, each ``Task`` is a node in a directed (possibly cyclic) graph,
+and includes the edges that connect to it and emerge from it.
+
 TaskList
 --------
 
-The ``TaskList`` class implements methods to build and execute a set of
-tasks with associated dependencies. The class implements a few public
-facing member functions that provide useful functionality for downstream
-apps:
-
-AddTask
-~~~~~~~
-
-``AddTask`` is a templated variadic function that takes the task
-function to be executed, the task dependencies (see ``TaskID`` below),
-and the arguments to the task function as it’s arguments. All arguments
-are captured by value in a lambda for later execution.
-
-When adding functions that are non-static class member functions, a
-slightly different interface is required. The first argument should be
-the class-name-scoped name of the function. For example, for a function
-named ``DoSomething`` in class ``SomeClass``, the first argument would
-be ``&SomeClass::DoSomething``. The second argument should be a pointer
-to the object that should invoke this member function. Finally, the
-dependencies and function arguments should be provided as described
-above.
-
-Examples of both ``AddTask`` calls can be found in the advection example
-`here `__.
-
-AddIteration
-~~~~~~~~~~~~
-
-``AddIteration`` provides a means of grouping a set of tasks together
-that will be executed repeatedly until stopping criteria are satisfied.
-``AddIteration`` returns an ``IterativeTasks`` object which provides
-overloaded ``AddTask`` functions as described above, but internally
-handles the bookkeeping necessary to maintain the association of all the
-tasks associated with the iterative process. A special function
-``SetCompletionTask``, which behaves identically to ``AddTask``, allows
-a task to be defined that evaluates the stopping criteria. The maximum
-number of iterations can be controlled through the ``SetMaxIterations``
-member function and the number of iterations between evaluating the
-stopping criteria can be set with the ``SetCheckInterval`` function.
-
-DoAvailable
-~~~~~~~~~~~
-
-``DoAvailable`` loops over the task list once, executing all tasks whose
-dependencies are satisfied. Completed tasks are removed from the task
-list.
-
-TaskID
-------
-
-The ``TaskID`` class implements methods that allow Parthenon to keep
-track of tasks, their dependencies, and what remains to be completed.
-The main way application code will interact with this object is as a
-returned object from ``TaskList::AddTask`` and as an argument to
-subsequent calls to ``TaskList::AddTask`` as a dependency for other
-tasks. When used as a dependency, ``TaskID`` objects can be combined
-with the bitwise or operator (``|``) to specify multiple dependencies.
+The ``TaskList`` class stores a vector of all the tasks and sublists (a nested
+``TaskList``) added to it. Additionally, it stores various bookkeeping
+information that facilitates more advanced features described below. Adding
+tasks and sublists is the only way to interact with ``TaskList`` objects.
+
+The basic call to ``AddTask`` takes the task's dependencies, the function to be
+executed, and the arguments to the function as its arguments. ``AddTask`` returns
+a ``TaskID`` object that can be used in subsequent calls to ``AddTask`` as a
+dependency, either on its own or combined with other ``TaskID``s via the ``|``
+operator. Use of the ``|`` operator is historical and perhaps a bit misleading, as
+it really acts as a logical AND -- that is, all tasks combined with ``|`` must be
+complete before the dependencies are satisfied. An overload of ``AddTask`` takes
+a ``TaskQualifier`` object as the first argument, which specifies certain special,
+non-default behaviors. These will be described below. Note that the default
+constructor of ``TaskID`` produces a special object that, when passed into
+``AddTask``, signifies that the task has no dependencies.
+
+The ``AddSublist`` function adds a nested ``TaskList`` to the ``TaskList`` on
+which it's called. The principal use case for this is to add iterative cycles
+to the graph, allowing one to execute a series of tasks repeatedly until some
+criteria are satisfied.
+The call takes as arguments the dependencies (via ``TaskID``s combined with
+``|``) that must be complete before the sublist executes and a
+``std::pair<int, int>`` specifying the minimum and maximum number of times
+the sublist should execute. Passing something like ``{min_iters, max_iters}``
+as the second argument should suffice, with ``{1, 1}`` leading to a sublist
+that never cycles. ``AddSublist`` returns a ``std::pair<TaskList&, TaskID>``
+which is conveniently accessed via a structured binding, e.g.
+
+.. code:: cpp
+
+  TaskID none;
+  auto [child_list, child_list_id] = parent_list.AddSublist(dependencies, {1, 3});
+  auto task_id = child_list.AddTask(none, SomeFunction, arg1, arg2);
+
+In the above example, passing ``none`` as the dependency for the task added to
+``child_list`` does not imply that this task can execute at any time since
+``child_list`` itself has dependencies that must be satisfied before any of its
+tasks can be invoked.
 
 TaskRegion
 ----------
 
-``TaskRegion`` is a lightweight class that wraps
-``std::vector``, providing a little extra functionality.
-During task execution (described below), all task lists in a
-``TaskRegion`` can be operated on concurrently. For example, a
-``TaskRegion`` can be used to construct independent task lists for each
-``MeshBlock``. Occasionally, it is useful to have a task not be
-considered complete until that task completes in all lists of a region.
-For example, a global iterative solver cannot be considered complete
-until the stopping criteria are satisfied everywhere, which may require
-evaluating those criteria in tasks that live in different lists within a
-region. An example of this use case is
-shown `here `__. The mechanism
-to mark a task so that dependent tasks will wait until all lists have
-completed it is to call ``AddRegionalDependencies``, as shown in the
-Poisson example.
+Under the hood, a ``TaskRegion`` is a directed, possibly cyclic graph. The graph
+is built up incrementally as tasks are added to the ``TaskList``s within the
+``TaskRegion``, and its construction is completed the first time the region is
+executed. ``TaskRegion``s can have one or more ``TaskList``s. The primary reason
+for this is to allow flexibility in how work is broken up into tasks (and
+eventually kernels). A region with many lists will produce many small
+tasks/kernels, but may expose more asynchrony (e.g. MPI communication). A region
+with fewer lists will produce more work per kernel (which may be good for GPUs,
+for example), but may limit asynchrony. Typically, each list is tied to a unique
+partition of the mesh blocks owned by a rank. ``TaskRegion`` only provides a few
+public-facing functions:
+
+- ``TaskListStatus Execute(ThreadPool &pool)``: ``TaskRegion``s can be executed,
+  requiring that a ``ThreadPool`` be provided by the caller. In practice,
+  ``Execute`` is usually called from the ``Execute`` member function of
+  ``TaskCollection``.
+- ``TaskList& operator[](const int i)``: return a reference to the ``i``th
+  ``TaskList`` in the region.
+- ``size_t size()``: return the number of ``TaskList``s in the region.
 
 TaskCollection
 --------------
@@ -120,21 +119,52 @@ is shown below.
 
 .. figure:: figs/TaskDiagram.png
    :alt: Task Diagram
 
-``TaskCollection`` provides two member functions, ``AddRegion`` and
-``Execute``.
-
-AddRegion
-~~~~~~~~~
-
-``AddRegion`` simply adds a new ``TaskRegion`` to the back of the
-collection and returns it as a reference. The integer argument
-determines how many task lists make up the region.
-
-Execute
-~~~~~~~
-
-Calling the ``Execute`` method on the ``TaskCollection`` executes all
-the tasks that have been added to the collection, processing each
-``TaskRegion`` in the order they were added, and allowing tasks in
-different ``TaskList``\ s but the same ``TaskRegion`` to be executed
-concurrently.
+``TaskCollection`` provides a few public-facing functions:
+
+- ``TaskRegion& AddRegion(const int num_lists)``: Add and return a reference to
+  a new ``TaskRegion`` with the specified number of ``TaskList``s.
+- ``TaskListStatus Execute(ThreadPool &pool)``: Execute all regions in the
+  collection. Regions are executed completely, in the order they were added,
+  before moving on to the next region. Task execution will take advantage of
+  the provided ``ThreadPool`` to (possibly) execute tasks across ``TaskList``s
+  in each region concurrently.
+- ``TaskListStatus Execute()``: Same as above, but execution will use an
+  internally generated ``ThreadPool`` with a single thread.
+
+NOTE: Work remains to make the rest of Parthenon thread-safe, so it is
+currently required to use a ``ThreadPool`` with one thread.
+
+TaskQualifier
+-------------
+
+``TaskQualifier`` s provide a mechanism for downstream codes to alter the
+default behavior of specific tasks in certain ways. The qualifiers are
+described below:
+
+- ``TaskQualifier::local_sync``: Tasks marked with ``local_sync`` synchronize
+  across lists in a region on a given MPI rank. Tasks that depend on a
+  ``local_sync``-marked task gain dependencies from the corresponding task on
+  all lists within a region. A typical use for this qualifier is to do a
+  rank-local reduction, for example before initiating a global MPI reduction
+  (which should be done only once per rank, not once per ``TaskList``). Note
+  that Parthenon links tasks across lists in the order they are added to each
+  list, i.e. the ``n``th ``local_sync`` task in a list is assumed to be
+  associated with the ``n``th ``local_sync`` task in all lists in the region.
+- ``TaskQualifier::global_sync``: Tasks marked with ``global_sync`` implicitly
+  have the same semantics as ``local_sync``, but additionally do a global
+  reduction on the ``TaskStatus`` to determine if/when execution can proceed
+  on to dependent tasks.
+- ``TaskQualifier::completion``: Tasks marked with ``completion`` can lead to
+  exiting execution of the owning ``TaskList``. If these tasks return
+  ``TaskStatus::complete`` and the minimum number of iterations of the list
+  have been completed, the remainder of the task list will be skipped (or the
+  iteration stopped). Returning ``TaskStatus::iterate`` leads to continued
+  execution/iteration, unless the maximum number of iterations has been
+  reached.
+- ``TaskQualifier::once_per_region``: Tasks with the ``once_per_region``
+  qualifier will only execute once (per iteration, if relevant) regardless of
+  the number of ``TaskList``s in the region. This can be useful when, for
+  example, doing MPI reductions, printing out some rank-wide state, or calling
+  a ``completion`` task that depends on some global condition where all lists
+  would evaluate identical code.
+
+``TaskQualifier`` s can be combined via the ``|`` operator and all combinations
+are supported. For example, you might mark a task
+``global_sync | completion | once_per_region`` if it were a task to determine
+whether an iteration should continue that depended on some previously reduced
+quantity.
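+
+As a rough sketch (the task functions and variables here are hypothetical),
+an iterative solve might combine these qualifiers with a sublist as:
+
+.. code:: cpp
+
+  using TQ = TaskQualifier;
+  TaskID none;
+  auto [itl, solver_id] = tl.AddSublist(dependence, {1, max_iters});
+  auto update = itl.AddTask(none, UpdateSolution, md.get());
+  auto resid = itl.AddTask(TQ::local_sync, update, SumResidual, md.get(), &norm.val);
+  auto check = itl.AddTask(TQ::completion | TQ::global_sync | TQ::once_per_region,
+                           resid, CheckResidual, &norm.val, tolerance);
+
+Here ``check`` returning ``TaskStatus::complete`` ends the cycle once the
+minimum iteration count is reached; returning ``TaskStatus::iterate`` continues
+it, up to ``max_iters`` passes through the sublist.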
diff --git a/example/advection/advection_driver.hpp b/example/advection/advection_driver.hpp index ec13edcc7bea..e0f692150119 100644 --- a/example/advection/advection_driver.hpp +++ b/example/advection/advection_driver.hpp @@ -39,6 +39,7 @@ class AdvectionDriver : public MultiStageDriver { void ProblemGenerator(MeshBlock *pmb, parthenon::ParameterInput *pin); void UserWorkAfterLoop(Mesh *mesh, parthenon::ParameterInput *pin, parthenon::SimTime &tm); +void UserMeshWorkBeforeOutput(Mesh *pmb, ParameterInput *pin, parthenon::SimTime const &); void PostStepMeshUserWorkInLoop(Mesh *mesh, parthenon::ParameterInput *pin, parthenon::SimTime const &tm); parthenon::Packages_t ProcessPackages(std::unique_ptr &pin); diff --git a/example/advection/advection_package.cpp b/example/advection/advection_package.cpp index 49200d0da87c..3008f457d8c9 100644 --- a/example/advection/advection_package.cpp +++ b/example/advection/advection_package.cpp @@ -192,6 +192,10 @@ std::shared_ptr Initialize(ParameterInput *pin) { pkg->AddSparsePool(field_name, m, std::vector{12, 37}); } + // add derived output variable + m = Metadata({Metadata::Cell, Metadata::OneCopy}, std::vector({1})); + pkg->AddField("my_derived_var", m); + // List (vector) of HistoryOutputVar that will all be enrolled as output variables parthenon::HstVar_list hst_vars = {}; // Now we add a couple of callback functions @@ -248,8 +252,7 @@ AmrTag CheckRefinement(MeshBlockData *rc) { typename Kokkos::MinMax::value_type minmax; pmb->par_reduce( - "advection check refinement", 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, + PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i, typename Kokkos::MinMax::value_type &lminmax) { lminmax.min_val = @@ -287,7 +290,7 @@ void PreFill(MeshBlockData *rc) { const int out = imap.get("one_minus_advected").first; const auto num_vars = rc->Get("advected").data.GetDim(4); pmb->par_for( - "advection_package::PreFill", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { v(out + n, k, j, i) = 1.0 - v(in + n, k, j, i); }); @@ -311,7 +314,7 @@ void SquareIt(MeshBlockData *rc) { const int out = imap.get("one_minus_advected_sq").first; const auto num_vars = rc->Get("advected").data.GetDim(4); pmb->par_for( - "advection_package::SquareIt", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { v(out + n, k, j, i) = v(in + n, k, j, i) * v(in + n, k, j, i); }); @@ -328,8 +331,8 @@ void SquareIt(MeshBlockData *rc) { if (profile == "smooth_gaussian") { const auto &advected = rc->Get("advected").data; pmb->par_for( - "advection_package::SquareIt bval check", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, - ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { PARTHENON_REQUIRE(advected(n, k, j, i) != 0.0, "Advected not properly initialized."); }); @@ -364,8 +367,8 @@ void PostFill(MeshBlockData *rc) { const int out37 = imap.get("one_minus_sqrt_one_minus_advected_sq_37").first; const auto num_vars = rc->Get("advected").data.GetDim(4); pmb->par_for( - "advection_package::PostFill", 0, 
num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { v(out12 + n, k, j, i) = 1.0 - sqrt(v(in + n, k, j, i)); v(out37 + n, k, j, i) = 1.0 - v(out12 + n, k, j, i); }); @@ -398,7 +401,8 @@ Real AdvectionHst(MeshData *md) { const bool volume_weighting = std::is_same>::value; pmb->par_reduce( - "AdvectionHst", 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, advected_pack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, + ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &lresult) { const auto &coords = advected_pack.GetCoords(b); // `join` is a function of the Kokkos::ReducerConecpt that allows to use the same @@ -429,7 +433,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // this is obviously overkill for this constant velocity problem Real min_dt; pmb->par_reduce( - "advection_package::EstimateTimestep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) { if (vx != 0.0) lmin_dt = std::min(lmin_dt, coords.Dxc(k, j, i) / std::abs(vx)); @@ -449,7 +453,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { TaskStatus CalculateFluxes(std::shared_ptr> &rc) { using parthenon::MetadataFlag; - Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pmb = rc->GetBlockPointer(); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); @@ -476,8 +480,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { size_t scratch_size_in_bytes = parthenon::ScratchPad2D::shmem_size(nvar, nx1); // get x-fluxes pmb->par_for_outer( - "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { parthenon::ScratchPad2D ql(member.team_scratch(scratch_level), nvar, nx1); parthenon::ScratchPad2D qr(member.team_scratch(scratch_level), nvar, nx1); // get reconstructed state on faces @@ -509,8 +513,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get y-fluxes if (pmb->pmy_mesh->ndim >= 2) { pmb->par_for_outer( - "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. 
The key point is that we cannot reuse // reconstructed arrays for different `j` with `j` being part of the outer @@ -552,7 +556,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get z-fluxes if (pmb->pmy_mesh->ndim == 3) { pmb->par_for_outer( - "x3 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e, + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, + jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse @@ -592,7 +597,6 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { }); } - Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes return TaskStatus::complete; } diff --git a/example/advection/custom_ascent_actions.yaml b/example/advection/custom_ascent_actions.yaml index 6297d89f8332..e76204d1665d 100644 --- a/example/advection/custom_ascent_actions.yaml +++ b/example/advection/custom_ascent_actions.yaml @@ -11,4 +11,9 @@ plt2: type: "mesh" image_prefix: "ascent_render_%02d" - + scene2: + plots: + plt2: + type: "pseudocolor" + field: "my_derived_var" + image_prefix: "derived_render_%02d" diff --git a/example/advection/main.cpp b/example/advection/main.cpp index a4c3cc6cf7e9..146093bc22eb 100644 --- a/example/advection/main.cpp +++ b/example/advection/main.cpp @@ -24,6 +24,7 @@ int main(int argc, char *argv[]) { pman.app_input->ProcessPackages = advection_example::ProcessPackages; pman.app_input->ProblemGenerator = advection_example::ProblemGenerator; pman.app_input->UserWorkAfterLoop = advection_example::UserWorkAfterLoop; + pman.app_input->UserMeshWorkBeforeOutput = advection_example::UserMeshWorkBeforeOutput; // call ParthenonInit to initialize MPI and Kokkos, parse the input deck, and set up auto manager_status = pman.ParthenonInitEnv(argc, argv); diff --git a/example/advection/parthenon_app_inputs.cpp b/example/advection/parthenon_app_inputs.cpp index d55e0243b9ff..e3d2a60fc132 100644 --- a/example/advection/parthenon_app_inputs.cpp +++ b/example/advection/parthenon_app_inputs.cpp @@ -21,6 +21,8 @@ #include "config.hpp" #include "defs.hpp" #include "interface/variable_pack.hpp" +#include "kokkos_abstraction.hpp" +#include "parameter_input.hpp" #include "utils/error_checking.hpp" using namespace parthenon::package::prelude; @@ -73,7 +75,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { if (profile == "block") profile_type = 3; pmb->par_for( - "Advection::ProblemGenerator", 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, num_vars - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { if (profile_type == 0) { Real x = cos_a2 * (coords.Xc<1>(i) * cos_a3 + coords.Xc<2>(j) * sin_a3) + @@ -99,8 +101,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { // initialize some arbitrary cells in the first block that move in all 6 directions if (profile_type == 3 && block_id == 0) { pmb->par_for( - "Advection::ProblemGenerator bvals test", 0, 1, - KOKKOS_LAMBDA(const int /*unused*/) { + PARTHENON_AUTO_LABEL, 0, 1, KOKKOS_LAMBDA(const int /*unused*/) { q(idx_adv, 4, 4, 4) = 10.0; q(idx_v, 4, 4, 4) = vx; q(idx_adv, 4, 6, 4) = 10.0; @@ -249,6 +250,25 @@ void UserWorkAfterLoop(Mesh *mesh, ParameterInput *pin, SimTime &tm) { return; } +void UserMeshWorkBeforeOutput(Mesh *mesh, ParameterInput *pin, SimTime const &) { + // loop over 
blocks + for (auto &pmb : mesh->block_list) { + auto rc = pmb->meshblock_data.Get(); // get base container + auto q = rc->Get("advected").data; + auto deriv = rc->Get("my_derived_var").data; + + IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); + IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior); + IndexRange kb = pmb->cellbounds.GetBoundsK(IndexDomain::interior); + + pmb->par_for( + "Advection::FillDerived", 0, 0, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + deriv(0, k, j, i) = std::log10(q(0, k, j, i) + 1.0e-5); + }); + } +} + Packages_t ProcessPackages(std::unique_ptr &pin) { Packages_t packages; auto pkg = advection_package::Initialize(pin.get()); diff --git a/example/advection/parthinput.advection b/example/advection/parthinput.advection index 60eba290f90b..bdc6def403be 100644 --- a/example/advection/parthinput.advection +++ b/example/advection/parthinput.advection @@ -75,7 +75,8 @@ dt = 0.05 variables = advected, advected_1, & # comments are ok one_minus_advected, & one_minus_advected_sq, & # on every (& characters are ok in comments) - one_minus_sqrt_one_minus_advected_sq # line + one_minus_sqrt_one_minus_advected_sq, & # line + my_derived_var file_type = hst diff --git a/example/calculate_pi/calculate_pi.cpp b/example/calculate_pi/calculate_pi.cpp index 03ef92fd6872..06d6ba3c1984 100644 --- a/example/calculate_pi/calculate_pi.cpp +++ b/example/calculate_pi/calculate_pi.cpp @@ -88,7 +88,7 @@ void SetInOrOut(MeshBlockData *rc) { // Loop bounds are set to catch the case where the edge is between the // cell centers of the first/last real cell and the first ghost cell pmb->par_for( - "SetInOrOut", kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s - 1, jb.e + 1, ib.s - 1, ib.e + 1, KOKKOS_LAMBDA(const int k, const int j, const int i) { Real rsq = std::pow(coords.Xc<1>(i), 2) + std::pow(coords.Xc<2>(j), 2); if (rsq < radius * radius) { diff --git a/example/calculate_pi/pi_driver.cpp b/example/calculate_pi/pi_driver.cpp index c322c17c0d3f..d2cf02b6b763 100644 --- a/example/calculate_pi/pi_driver.cpp +++ b/example/calculate_pi/pi_driver.cpp @@ -90,11 +90,11 @@ parthenon::DriverStatus PiDriver::Execute() { // retrieve "pi_val" and post execute. 
auto &pi_val = pmesh->packages.Get("calculate_pi")->Param("pi_val"); pmesh->mbcnt = pmesh->nbtotal; // this is how many blocks were processed - PostExecute(pi_val); + PiPostExecute(pi_val); return DriverStatus::complete; } -void PiDriver::PostExecute(Real pi_val) { +void PiDriver::PiPostExecute(Real pi_val) { if (my_rank == 0) { std::cout << std::endl << std::endl diff --git a/example/calculate_pi/pi_driver.hpp b/example/calculate_pi/pi_driver.hpp index 7936a4041d6d..7168781c127c 100644 --- a/example/calculate_pi/pi_driver.hpp +++ b/example/calculate_pi/pi_driver.hpp @@ -42,7 +42,7 @@ class PiDriver : public Driver { DriverStatus Execute() override; protected: - void PostExecute(Real pi_val); + void PiPostExecute(Real pi_val); }; } // namespace pi diff --git a/example/kokkos_pi/kokkos_pi.cpp b/example/kokkos_pi/kokkos_pi.cpp index 4bcdbd8d3819..6d5f69c980bc 100644 --- a/example/kokkos_pi/kokkos_pi.cpp +++ b/example/kokkos_pi/kokkos_pi.cpp @@ -272,7 +272,7 @@ result_t naiveParFor(int n_block, int n_mesh, int n_iter, double radius) { auto inOrOut = base->PackVariables({Metadata::Independent}); // iops = 0 fops = 11 par_for( - DEFAULT_LOOP_PATTERN, "par_for in or out", DevExecSpace(), 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, inOrOut.GetDim(4) - 1, nghost, inOrOut.GetDim(3) - nghost - 1, nghost, inOrOut.GetDim(2) - nghost - 1, nghost, inOrOut.GetDim(1) - nghost - 1, KOKKOS_LAMBDA(const int l, const int k_grid, const int j_grid, diff --git a/example/particle_leapfrog/particle_leapfrog.cpp b/example/particle_leapfrog/particle_leapfrog.cpp index b54bd6ddebd3..139abdcc3a44 100644 --- a/example/particle_leapfrog/particle_leapfrog.cpp +++ b/example/particle_leapfrog/particle_leapfrog.cpp @@ -1,9 +1,9 @@ //======================================================================================== // Parthenon performance portable AMR framework -// Copyright(C) 2021-2022 The Parthenon collaboration +// Copyright(C) 2021-2024 The Parthenon collaboration // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -175,9 +175,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { Kokkos::deep_copy(pmb->exec_space, ids_this_block, ids_this_block_h); - ParArrayND new_indices; - const auto new_particles_mask = - swarm->AddEmptyParticles(num_particles_this_block, new_indices); + auto new_particles_context = swarm->AddEmptyParticles(num_particles_this_block); auto &id = swarm->Get("id").Get(); auto &x = swarm->Get("x").Get(); @@ -189,7 +187,9 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { // This hardcoded implementation should only used in PGEN and not during runtime // addition of particles as indices need to be taken into account. 
pmb->par_for( - "CreateParticles", 0, num_particles_this_block - 1, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, new_particles_context.GetNewParticlesMaxIndex(), + KOKKOS_LAMBDA(const int new_n) { + const int n = new_particles_context.GetNewParticleIndex(new_n); const auto &m = ids_this_block(n); id(n) = m; // global unique id @@ -227,7 +227,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator const Real ay = 0.0; const Real az = 0.0; pmb->par_for( - "Leapfrog", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { // drift x(n) += v(0, n) * 0.5 * dt; diff --git a/example/particle_tracers/particle_tracers.cpp b/example/particle_tracers/particle_tracers.cpp index 1f8394a6c6c4..d5cc3cbed6f3 100644 --- a/example/particle_tracers/particle_tracers.cpp +++ b/example/particle_tracers/particle_tracers.cpp @@ -1,9 +1,9 @@ //======================================================================================== // Parthenon performance portable AMR framework -// Copyright(C) 2021-2022 The Parthenon collaboration +// Copyright(C) 2021-2024 The Parthenon collaboration // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2021-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -182,7 +182,7 @@ TaskStatus AdvectTracers(MeshBlock *pmb, const StagedIntegrator *integrator) { auto swarm_d = swarm->GetDeviceContext(); pmb->par_for( - "Tracer advection", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { x(n) += vx * dt; y(n) += vy * dt; @@ -219,13 +219,13 @@ TaskStatus DepositTracers(MeshBlock *pmb) { auto &tracer_dep = pmb->meshblock_data.Get()->Get("tracer_deposition").data; // Reset particle count pmb->par_for( - "ZeroParticleDep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { tracer_dep(k, j, i) = 0.; }); const int ndim = pmb->pmy_mesh->ndim; pmb->par_for( - "DepositTracers", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { int i = static_cast(std::floor((x(n) - minx_i) / dx_i) + ib.s); int j = 0; @@ -269,7 +269,7 @@ TaskStatus CalculateFluxes(MeshBlockData *mbd) { // Spatially first order upwind method pmb->par_for( - "CalculateFluxesX1", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e + 1, KOKKOS_LAMBDA(const int k, const int j, const int i) { // X1 if (vx > 0.) { @@ -282,7 +282,7 @@ TaskStatus CalculateFluxes(MeshBlockData *mbd) { if (ndim > 1) { auto x2flux = mbd->Get("advected").flux[X2DIR].Get<4>(); pmb->par_for( - "CalculateFluxesX2", kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e + 1, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { // X2 if (vy > 0.) 
{ @@ -296,7 +296,7 @@ TaskStatus CalculateFluxes(MeshBlockData *mbd) { if (ndim > 2) { auto x3flux = mbd->Get("advected").flux[X3DIR].Get<4>(); pmb->par_for( - "CalculateFluxesX3", kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e + 1, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { // X3 if (vz > 0.) { @@ -355,7 +355,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { const Real kwave = 2. * M_PI / (x_max_mesh - x_min_mesh); pmb->par_for( - "Init advected profile", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { advected(k, j, i) = advected_mean + advected_amp * sin(kwave * coords.Xc<1>(i)); }); @@ -375,8 +375,7 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { int num_tracers_meshblock = std::round(num_tracers * number_meshblock / number_mesh); int gid = pmb->gid; - ParArrayND new_indices; - swarm->AddEmptyParticles(num_tracers_meshblock, new_indices); + auto new_particles_context = swarm->AddEmptyParticles(num_tracers_meshblock); auto &x = swarm->Get("x").Get(); auto &y = swarm->Get("y").Get(); @@ -384,10 +383,10 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { auto &id = swarm->Get("id").Get(); auto swarm_d = swarm->GetDeviceContext(); - // This hardcoded implementation should only used in PGEN and not during runtime - // addition of particles as indices need to be taken into account. pmb->par_for( - "CreateParticles", 0, num_tracers_meshblock - 1, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, new_particles_context.GetNewParticlesMaxIndex(), + KOKKOS_LAMBDA(const int new_n) { + const int n = new_particles_context.GetNewParticleIndex(new_n); auto rng_gen = rng_pool.get_state(); // Rejection sample the x position diff --git a/example/particles/particles.cpp b/example/particles/particles.cpp index 5e5648518d3e..0ae0e058a9c1 100644 --- a/example/particles/particles.cpp +++ b/example/particles/particles.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -130,7 +130,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // first some helper tasks TaskStatus DestroySomeParticles(MeshBlock *pmb) { - Kokkos::Profiling::pushRegion("Task_Particles_DestroySomeParticles"); + PARTHENON_INSTRUMENT auto pkg = pmb->packages.Get("particles_package"); auto swarm = pmb->swarm_data.Get()->Get("my_particles"); @@ -143,7 +143,7 @@ TaskStatus DestroySomeParticles(MeshBlock *pmb) { // Randomly mark some fraction of particles each timestep for removal pmb->par_for( - "DestroySomeParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { auto rng_gen = rng_pool.get_state(); if (rng_gen.drand() > 1.0 - destroy_particles_frac) { @@ -156,7 +156,6 @@ TaskStatus DestroySomeParticles(MeshBlock *pmb) { // Remove marked particles swarm->RemoveMarkedParticles(); - Kokkos::Profiling::popRegion(); // Task_Particles_DestroySomeParticles return TaskStatus::complete; } @@ -172,7 +171,7 @@ TaskStatus SortParticlesIfUsingPerCellDeposition(MeshBlock *pmb) { } TaskStatus DepositParticles(MeshBlock *pmb) { - Kokkos::Profiling::pushRegion("Task_Particles_DepositParticles"); + PARTHENON_INSTRUMENT auto swarm = pmb->swarm_data.Get()->Get("my_particles"); @@ -202,13 +201,13 @@ TaskStatus DepositParticles(MeshBlock *pmb) { if (deposition_method == DepositionMethod::per_particle) { // Reset particle count pmb->par_for( - "ZeroParticleDep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { particle_dep(k, j, i) = 0.; }); pmb->par_for( - "DepositParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { int i = static_cast(std::floor((x(n) - minx_i) / dx_i) + ib.s); int j = 0; @@ -228,7 +227,7 @@ TaskStatus DepositParticles(MeshBlock *pmb) { }); } else if (deposition_method == DepositionMethod::per_cell) { pmb->par_for( - "DepositParticlesByCell", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { particle_dep(k, j, i) = 0.; for (int n = 0; n < swarm_d.GetParticleCountPerCell(k, j, i); n++) { @@ -238,12 +237,11 @@ TaskStatus DepositParticles(MeshBlock *pmb) { }); } - Kokkos::Profiling::popRegion(); // Task_Particles_DepositParticles return TaskStatus::complete; } TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { - Kokkos::Profiling::pushRegion("Task_Particles_CreateSomeParticles"); + PARTHENON_INSTRUMENT auto pkg = pmb->packages.Get("particles_package"); auto swarm = pmb->swarm_data.Get()->Get("my_particles"); @@ -252,8 +250,8 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double t0) { auto vel = pkg->Param("particle_speed"); const auto orbiting_particles = pkg->Param("orbiting_particles"); - ParArrayND new_indices; - const auto new_particles_mask = swarm->AddEmptyParticles(num_particles, new_indices); + // Create new particles and get accessor + auto newParticlesContext = swarm->AddEmptyParticles(num_particles); // Meshblock geometry const IndexRange &ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); @@ -280,84 +278,82 @@ TaskStatus CreateSomeParticles(MeshBlock *pmb, const double 
t0) { if (orbiting_particles) { pmb->par_for( - "CreateSomeOrbitingParticles", 0, swarm->GetMaxActiveIndex(), - KOKKOS_LAMBDA(const int n) { - if (new_particles_mask(n)) { - auto rng_gen = rng_pool.get_state(); - - // Randomly sample in space in this meshblock while staying within 0.5 of - // origin - Real r; - do { - x(n) = minx_i + nx_i * dx_i * rng_gen.drand(); - y(n) = minx_j + nx_j * dx_j * rng_gen.drand(); - z(n) = minx_k + nx_k * dx_k * rng_gen.drand(); - r = sqrt(x(n) * x(n) + y(n) * y(n) + z(n) * z(n)); - } while (r > 0.5); - - // Randomly sample direction perpendicular to origin - Real theta = acos(2. * rng_gen.drand() - 1.); - Real phi = 2. * M_PI * rng_gen.drand(); - v(0, n) = sin(theta) * cos(phi); - v(1, n) = sin(theta) * sin(phi); - v(2, n) = cos(theta); - // Project v onto plane normal to sphere - Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n); - Real NdN = r * r; - v(0, n) = v(0, n) - vdN / NdN * x(n); - v(1, n) = v(1, n) - vdN / NdN * y(n); - v(2, n) = v(2, n) - vdN / NdN * z(n); - - // Normalize - Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); - PARTHENON_DEBUG_REQUIRE(v_tmp > 0., "Speed must be > 0!"); - for (int ii = 0; ii < 3; ii++) { - v(ii, n) *= vel / v_tmp; - } + PARTHENON_AUTO_LABEL, 0, newParticlesContext.GetNewParticlesMaxIndex(), + KOKKOS_LAMBDA(const int new_n) { + const int n = newParticlesContext.GetNewParticleIndex(new_n); + auto rng_gen = rng_pool.get_state(); + + // Randomly sample in space in this meshblock while staying within 0.5 of + // origin + Real r; + do { + x(n) = minx_i + nx_i * dx_i * rng_gen.drand(); + y(n) = minx_j + nx_j * dx_j * rng_gen.drand(); + z(n) = minx_k + nx_k * dx_k * rng_gen.drand(); + r = sqrt(x(n) * x(n) + y(n) * y(n) + z(n) * z(n)); + } while (r > 0.5); + + // Randomly sample direction perpendicular to origin + Real theta = acos(2. * rng_gen.drand() - 1.); + Real phi = 2. 
* M_PI * rng_gen.drand(); + v(0, n) = sin(theta) * cos(phi); + v(1, n) = sin(theta) * sin(phi); + v(2, n) = cos(theta); + // Project v onto plane normal to sphere + Real vdN = v(0, n) * x(n) + v(1, n) * y(n) + v(2, n) * z(n); + Real NdN = r * r; + v(0, n) = v(0, n) - vdN / NdN * x(n); + v(1, n) = v(1, n) - vdN / NdN * y(n); + v(2, n) = v(2, n) - vdN / NdN * z(n); + + // Normalize + Real v_tmp = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); + PARTHENON_DEBUG_REQUIRE(v_tmp > 0., "Speed must be > 0!"); + for (int ii = 0; ii < 3; ii++) { + v(ii, n) *= vel / v_tmp; + } - // Create particles at the beginning of the timestep - t(n) = t0; + // Create particles at the beginning of the timestep + t(n) = t0; - weight(n) = 1.0; + weight(n) = 1.0; - rng_pool.free_state(rng_gen); - } + rng_pool.free_state(rng_gen); }); } else { pmb->par_for( - "CreateSomeParticles", 0, swarm->GetMaxActiveIndex(), KOKKOS_LAMBDA(const int n) { - if (new_particles_mask(n)) { - auto rng_gen = rng_pool.get_state(); + PARTHENON_AUTO_LABEL, 0, newParticlesContext.GetNewParticlesMaxIndex(), + KOKKOS_LAMBDA(const int new_n) { + const int n = newParticlesContext.GetNewParticleIndex(new_n); + auto rng_gen = rng_pool.get_state(); - // Randomly sample in space in this meshblock - x(n) = minx_i + nx_i * dx_i * rng_gen.drand(); - y(n) = minx_j + nx_j * dx_j * rng_gen.drand(); - z(n) = minx_k + nx_k * dx_k * rng_gen.drand(); + // Randomly sample in space in this meshblock + x(n) = minx_i + nx_i * dx_i * rng_gen.drand(); + y(n) = minx_j + nx_j * dx_j * rng_gen.drand(); + z(n) = minx_k + nx_k * dx_k * rng_gen.drand(); - // Randomly sample direction on the unit sphere, fixing speed - Real theta = acos(2. * rng_gen.drand() - 1.); - Real phi = 2. * M_PI * rng_gen.drand(); - v(0, n) = vel * sin(theta) * cos(phi); - v(1, n) = vel * sin(theta) * sin(phi); - v(2, n) = vel * cos(theta); + // Randomly sample direction on the unit sphere, fixing speed + Real theta = acos(2. * rng_gen.drand() - 1.); + Real phi = 2. 
* M_PI * rng_gen.drand(); + v(0, n) = vel * sin(theta) * cos(phi); + v(1, n) = vel * sin(theta) * sin(phi); + v(2, n) = vel * cos(theta); - // Create particles at the beginning of the timestep - t(n) = t0; + // Create particles at the beginning of the timestep + t(n) = t0; - weight(n) = 1.0; + weight(n) = 1.0; - rng_pool.free_state(rng_gen); - } + rng_pool.free_state(rng_gen); }); } - Kokkos::Profiling::popRegion(); // Task_Particles_CreateSomeParticles return TaskStatus::complete; } TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator, const double t0) { - Kokkos::Profiling::pushRegion("Task_Particles_TransportParticles"); + PARTHENON_INSTRUMENT auto swarm = pmb->swarm_data.Get()->Get("my_particles"); auto pkg = pmb->packages.Get("particles_package"); @@ -396,7 +392,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator if (orbiting_particles) { // Particles orbit the origin pmb->par_for( - "TransportOrbitingParticles", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { Real vel = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); PARTHENON_DEBUG_REQUIRE(vel > 0., "Speed must be > 0!"); @@ -449,7 +445,7 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator } else { // Particles move in straight lines pmb->par_for( - "TransportParticles", 0, max_active_index, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index, KOKKOS_LAMBDA(const int n) { if (swarm_d.IsActive(n)) { Real vel = sqrt(v(0, n) * v(0, n) + v(1, n) * v(1, n) + v(2, n) * v(2, n)); PARTHENON_DEBUG_REQUIRE(vel > 0., "vel must be > 0 for division!"); @@ -478,7 +474,6 @@ TaskStatus TransportParticles(MeshBlock *pmb, const StagedIntegrator *integrator }); } - Kokkos::Profiling::popRegion(); // Task_Particles_TransportParticles return TaskStatus::complete; } @@ -520,7 +515,7 @@ TaskListStatus ParticleDriver::Step() { // TODO(BRR) This should really be in parthenon/src... 
but it can't just live in Swarm // because of the loop over blocks TaskStatus StopCommunicationMesh(const BlockList_t &blocks) { - Kokkos::Profiling::pushRegion("Task_Particles_StopCommunicationMesh"); + PARTHENON_INSTRUMENT int num_sent_local = 0; for (auto &block : blocks) { @@ -574,7 +569,6 @@ TaskStatus StopCommunicationMesh(const BlockList_t &blocks) { } } - Kokkos::Profiling::popRegion(); // Task_Particles_StopCommunicationMesh return TaskStatus::complete; } diff --git a/example/poisson/parthenon_app_inputs.cpp b/example/poisson/parthenon_app_inputs.cpp index 4c2bd177c42f..2dd29551f350 100644 --- a/example/poisson/parthenon_app_inputs.cpp +++ b/example/poisson/parthenon_app_inputs.cpp @@ -50,8 +50,8 @@ void ProblemGenerator(Mesh *pm, ParameterInput *pin, MeshData *md) { const int iphi = imap["potential"].first; pmb->par_for( - "Poisson::ProblemGenerator", 0, q_bpack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, q_bpack.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const auto &coords = q_bpack.GetCoords(b); auto &q = q_bpack(b); Real dist2 = std::pow(coords.Xc<1>(i) - x0, 2) + diff --git a/example/poisson/poisson_driver.cpp b/example/poisson/poisson_driver.cpp index a94f6874ab70..e2ec03a354d3 100644 --- a/example/poisson/poisson_driver.cpp +++ b/example/poisson/poisson_driver.cpp @@ -70,9 +70,7 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { // and a kokkos view just for fun AllReduce> *pview_reduce = pkg->MutableParam>>("view_reduce"); - int reg_dep_id; for (int i = 0; i < num_partitions; i++) { - reg_dep_id = 0; // make/get a mesh_data container for the state auto &md = pmesh->mesh_data.GetOrAdd("base", i); auto &mdelta = pmesh->mesh_data.GetOrAdd("delta", i); @@ -81,101 +79,83 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { //--- Demo a few reductions // pass a pointer to the variable being reduced into - auto loc_red = tl.AddTask(none, poisson_package::SumMass>, md.get(), - &total_mass.val); - // make it a regional dependency so dependent tasks can't execute until all lists do - // this - solver_region.AddRegionalDependencies(reg_dep_id, i, loc_red); - reg_dep_id++; + auto loc_red = + tl.AddTask(TaskQualifier::local_sync, none, + poisson_package::SumMass>, md.get(), &total_mass.val); auto rank_red = tl.AddTask( - none, + TaskQualifier::local_sync, none, [](int *max_rank) { *max_rank = std::max(*max_rank, Globals::my_rank); return TaskStatus::complete; }, &max_rank.val); - solver_region.AddRegionalDependencies(reg_dep_id, i, rank_red); - reg_dep_id++; // start a non-blocking MPI_Iallreduce auto start_global_reduce = - (i == 0 ? tl.AddTask(loc_red, &AllReduce::StartReduce, &total_mass, MPI_SUM) - : none); + tl.AddTask(TaskQualifier::once_per_region, loc_red, &AllReduce::StartReduce, + &total_mass, MPI_SUM); - auto start_rank_reduce = - (i == 0 ? 
tl.AddTask(rank_red, &Reduce::StartReduce, &max_rank, 0, MPI_MAX) - : none); + auto start_rank_reduce = tl.AddTask(TaskQualifier::once_per_region, rank_red, + &Reduce::StartReduce, &max_rank, 0, MPI_MAX); // test the reduction until it completes auto finish_global_reduce = - tl.AddTask(start_global_reduce, &AllReduce::CheckReduce, &total_mass); - solver_region.AddRegionalDependencies(reg_dep_id, i, finish_global_reduce); - reg_dep_id++; + tl.AddTask(TaskQualifier::local_sync | TaskQualifier::once_per_region, + start_global_reduce, &AllReduce::CheckReduce, &total_mass); auto finish_rank_reduce = - tl.AddTask(start_rank_reduce, &Reduce::CheckReduce, &max_rank); - solver_region.AddRegionalDependencies(reg_dep_id, i, finish_rank_reduce); - reg_dep_id++; + tl.AddTask(TaskQualifier::local_sync | TaskQualifier::once_per_region, + start_rank_reduce, &Reduce::CheckReduce, &max_rank); // notice how we must always pass a pointer to the reduction value // since tasks capture args by value, this would print zero if we just passed in // the val since the tasks that compute the value haven't actually executed yet - auto report_mass = (i == 0 && Globals::my_rank == 0 - ? tl.AddTask( - finish_global_reduce, - [](Real *mass) { - std::cout << "Total mass = " << *mass << std::endl; - return TaskStatus::complete; - }, - &total_mass.val) - : none); - auto report_rank = (i == 0 && Globals::my_rank == 0 - ? tl.AddTask( - finish_rank_reduce, - [](int *max_rank) { - std::cout << "Max rank = " << *max_rank << std::endl; - return TaskStatus::complete; - }, - &max_rank.val) - : none); + auto report_mass = tl.AddTask( + TaskQualifier::once_per_region, finish_global_reduce, + [](Real *mass) { + if (Globals::my_rank == 0) std::cout << "Total mass = " << *mass << std::endl; + return TaskStatus::complete; + }, + &total_mass.val); + auto report_rank = tl.AddTask( + TaskQualifier::once_per_region, finish_rank_reduce, + [](int *max_rank) { + if (Globals::my_rank == 0) std::cout << "Max rank = " << *max_rank << std::endl; + return TaskStatus::complete; + }, + &max_rank.val); //--- Begining of tasks related to solving the Poisson eq. auto mat_elem = tl.AddTask(none, poisson_package::SetMatrixElements>, md.get()); - auto &solver = tl.AddIteration("poisson solver"); - solver.SetMaxIterations(max_iters); - solver.SetCheckInterval(check_interval); - solver.SetFailWithMaxIterations(fail_flag); - solver.SetWarnWithMaxIterations(warn_flag); + auto [solver, solver_id] = tl.AddSublist(mat_elem, {1, max_iters}); auto start_recv = solver.AddTask(none, parthenon::StartReceiveBoundaryBuffers, md); - auto update = solver.AddTask(mat_elem, poisson_package::UpdatePhi>, + auto update = solver.AddTask(none, poisson_package::UpdatePhi>, md.get(), mdelta.get()); - auto norm = solver.AddTask(update, poisson_package::SumDeltaPhi>, - mdelta.get(), &update_norm.val); - solver_region.AddRegionalDependencies(reg_dep_id, i, norm); - reg_dep_id++; - auto start_reduce_norm = (i == 0 ? solver.AddTask(norm, &AllReduce::StartReduce, - &update_norm, MPI_SUM) - : none); + auto norm = solver.AddTask(TaskQualifier::local_sync, update, + poisson_package::SumDeltaPhi>, mdelta.get(), + &update_norm.val); + auto start_reduce_norm = + solver.AddTask(TaskQualifier::once_per_region, norm, + &AllReduce::StartReduce, &update_norm, MPI_SUM); auto finish_reduce_norm = - solver.AddTask(start_reduce_norm, &AllReduce::CheckReduce, &update_norm); - auto report_norm = (i == 0 ? 
solver.AddTask( - finish_reduce_norm, - [](Real *norm) { - if (Globals::my_rank == 0) { - std::cout << "Update norm = " << *norm - << std::endl; - } - *norm = 0.0; - return TaskStatus::complete; - }, - &update_norm.val) - : none); + solver.AddTask(TaskQualifier::once_per_region, start_reduce_norm, + &AllReduce::CheckReduce, &update_norm); + auto report_norm = solver.AddTask( + TaskQualifier::once_per_region, finish_reduce_norm, + [](Real *norm) { + if (Globals::my_rank == 0) { + std::cout << "Update norm = " << *norm << std::endl; + } + *norm = 0.0; + return TaskStatus::complete; + }, + &update_norm.val); auto send = solver.AddTask(update, SendBoundaryBuffers, md); @@ -183,24 +163,18 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { auto setb = solver.AddTask(recv | update, SetBoundaries, md); - auto check = solver.SetCompletionTask( - send | setb | report_norm, poisson_package::CheckConvergence>, - md.get(), mdelta.get()); - // mark task so that dependent tasks (below) won't execute - // until all task lists have completed it - solver_region.AddRegionalDependencies(reg_dep_id, i, check); - reg_dep_id++; - - auto print = none; - if (i == 0) { // only print once - print = tl.AddTask(check, poisson_package::PrintComplete); - } + auto check = solver.AddTask( + TaskQualifier::completion | TaskQualifier::global_sync, send | setb | report_norm, + poisson_package::CheckConvergence>, md.get(), mdelta.get()); + + auto print = tl.AddTask(TaskQualifier::once_per_region, solver_id, + poisson_package::PrintComplete); //--- End of tasks related to solving the Poisson eq // do a vector reduction (everything below here), just for fun // first fill it in auto fill_vec = tl.AddTask( - none, + TaskQualifier::local_sync, none, [](std::vector *vec) { auto &v = *vec; for (int n = 0; n < v.size(); n++) @@ -208,72 +182,64 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { return TaskStatus::complete; }, &vec_reduce.val); - solver_region.AddRegionalDependencies(reg_dep_id, i, fill_vec); - reg_dep_id++; TaskID start_vec_reduce = - (i == 0 ? tl.AddTask(fill_vec, &AllReduce>::StartReduce, - &vec_reduce, MPI_SUM) - : none); + tl.AddTask(TaskQualifier::once_per_region, fill_vec, + &AllReduce>::StartReduce, &vec_reduce, MPI_SUM); // test the reduction until it completes TaskID finish_vec_reduce = tl.AddTask( - start_vec_reduce, &AllReduce>::CheckReduce, &vec_reduce); - solver_region.AddRegionalDependencies(reg_dep_id, i, finish_vec_reduce); - reg_dep_id++; - - auto report_vec = (i == 0 && Globals::my_rank == 0 - ? 
tl.AddTask( - finish_vec_reduce, - [num_partitions](std::vector *vec) { - auto &v = *vec; - std::cout << "Vec reduction: "; - for (int n = 0; n < v.size(); n++) { - std::cout << v[n] << " "; - } - std::cout << std::endl; - std::cout << "Should be: "; - for (int n = 0; n < v.size(); n++) { - std::cout << n * num_partitions * Globals::nranks - << " "; - } - std::cout << std::endl; - return TaskStatus::complete; - }, - &vec_reduce.val) - : none); + TaskQualifier::once_per_region | TaskQualifier::local_sync, start_vec_reduce, + &AllReduce>::CheckReduce, &vec_reduce); + + auto report_vec = tl.AddTask( + TaskQualifier::once_per_region, finish_vec_reduce, + [num_partitions](std::vector *vec) { + if (Globals::my_rank == 0) { + auto &v = *vec; + std::cout << "Vec reduction: "; + for (int n = 0; n < v.size(); n++) { + std::cout << v[n] << " "; + } + std::cout << std::endl; + std::cout << "Should be: "; + for (int n = 0; n < v.size(); n++) { + std::cout << n * num_partitions * Globals::nranks << " "; + } + std::cout << std::endl; + } + return TaskStatus::complete; + }, + &vec_reduce.val); // And lets do a view reduce too just for fun // The views are filled in the package TaskID start_view_reduce = - (i == 0 ? tl.AddTask(none, &AllReduce>::StartReduce, - pview_reduce, MPI_SUM) - : none); + tl.AddTask(TaskQualifier::once_per_region, none, + &AllReduce>::StartReduce, pview_reduce, MPI_SUM); // test the reduction until it completes TaskID finish_view_reduce = tl.AddTask( - start_view_reduce, &AllReduce>::CheckReduce, pview_reduce); - solver_region.AddRegionalDependencies(reg_dep_id, i, finish_view_reduce); - reg_dep_id++; - - auto report_view = (i == 0 && Globals::my_rank == 0 - ? tl.AddTask( - finish_view_reduce, - [num_partitions](HostArray1D *view) { - auto &v = *view; - std::cout << "View reduction: "; - for (int n = 0; n < v.size(); n++) { - std::cout << v(n) << " "; - } - std::cout << std::endl; - std::cout << "Should be: "; - for (int n = 0; n < v.size(); n++) { - std::cout << n * num_partitions * Globals::nranks - << " "; - } - std::cout << std::endl; - return TaskStatus::complete; - }, - &(pview_reduce->val)) - : none); + TaskQualifier::once_per_region | TaskQualifier::local_sync, start_view_reduce, + &AllReduce>::CheckReduce, pview_reduce); + + auto report_view = tl.AddTask( + TaskQualifier::once_per_region, finish_view_reduce, + [num_partitions](HostArray1D *view) { + if (Globals::my_rank == 0) { + auto &v = *view; + std::cout << "View reduction: "; + for (int n = 0; n < v.size(); n++) { + std::cout << v(n) << " "; + } + std::cout << std::endl; + std::cout << "Should be: "; + for (int n = 0; n < v.size(); n++) { + std::cout << n * num_partitions * Globals::nranks << " "; + } + std::cout << std::endl; + } + return TaskStatus::complete; + }, + &(pview_reduce->val)); } return tc; diff --git a/example/poisson/poisson_package.cpp b/example/poisson/poisson_package.cpp index 7037c42b2061..047868ec86f5 100644 --- a/example/poisson/poisson_package.cpp +++ b/example/poisson/poisson_package.cpp @@ -129,8 +129,8 @@ TaskStatus SetMatrixElements(T *u) { const int ndim = v.GetNdim(); const Real w0 = -2.0 * ndim; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "SetMatElem", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, kb.e, - jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { for (int n = isp_lo; n <= isp_hi; n++) { v(b, n, k, j, i) = 1; @@ -169,8 +169,8 @@ 
TaskStatus SumMass(T *u, Real *reduce_sum) { Real total; parthenon::par_reduce( - parthenon::loop_pattern_mdrange_tag, "SumMass", DevExecSpace(), 0, v.GetDim(5) - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, + v.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &sum) { sum += v(b, irho, k, j, i) * std::pow(dx, ndim); }, @@ -195,8 +195,8 @@ TaskStatus SumDeltaPhi(T *du, Real *reduce_sum) { Real total; parthenon::par_reduce( - parthenon::loop_pattern_mdrange_tag, "SumMass", DevExecSpace(), 0, dv.GetDim(5) - 1, - kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, + dv.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &sum) { sum += std::pow(dv(b, iphi, k, j, i), 2); }, @@ -208,7 +208,7 @@ TaskStatus SumDeltaPhi(T *du, Real *reduce_sum) { template TaskStatus UpdatePhi(T *u, T *du) { using Stencil_t = parthenon::solvers::Stencil; - Kokkos::Profiling::pushRegion("Task_Poisson_UpdatePhi"); + PARTHENON_INSTRUMENT auto pm = u->GetParentPointer(); IndexRange ib = u->GetBoundsI(IndexDomain::interior); @@ -246,8 +246,8 @@ TaskStatus UpdatePhi(T *u, T *du) { if (isp_hi < 0) { // there is no sparse matrix, so we must be using the stencil const auto &stencil = pkg->Param("stencil"); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "StencilJacobi", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, - kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const Real rhs = dV * v(b, irho, k, j, i); const Real phi_new = stencil.Jacobi(v, iphi, b, k, j, i, rhs); @@ -258,8 +258,8 @@ TaskStatus UpdatePhi(T *u, T *du) { const auto &sp_accessor = pkg->Param("sparse_accessor"); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "SparseUpdate", DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, - kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { const Real rhs = dV * v(b, irho, k, j, i); const Real phi_new = @@ -269,19 +269,18 @@ TaskStatus UpdatePhi(T *u, T *du) { } parthenon::par_for( - DEFAULT_LOOP_PATTERN, "UpdatePhi", DevExecSpace(), 0, dv.GetDim(5) - 1, kb.s, kb.e, - jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, dv.GetDim(5) - 1, + kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) { v(b, iphi, k, j, i) += dv(b, idphi, k, j, i); }); - Kokkos::Profiling::popRegion(); // Task_Poisson_UpdatePhi return TaskStatus::complete; } template TaskStatus CheckConvergence(T *u, T *du) { - Kokkos::Profiling::pushRegion("Task_Poisson_UpdatePhi"); + PARTHENON_INSTRUMENT auto pm = u->GetParentPointer(); IndexRange ib = u->GetBoundsI(IndexDomain::interior); @@ -298,7 +297,7 @@ TaskStatus CheckConvergence(T *u, T *du) { Real max_err; parthenon::par_reduce( - parthenon::loop_pattern_mdrange_tag, "CheckConvergence", DevExecSpace(), 0, + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, v.GetDim(5) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i, Real &eps) { Real reps = 
std::abs(dv(b, idphi, k, j, i) / v(b, iphi, k, j, i)); @@ -312,7 +311,6 @@ TaskStatus CheckConvergence(T *u, T *du) { auto status = (max_err < err_tol ? TaskStatus::complete : TaskStatus::iterate); - Kokkos::Profiling::popRegion(); // Task_Poisson_CheckConvergence return status; } diff --git a/example/poisson_gmg/parthinput.poisson b/example/poisson_gmg/parthinput.poisson index 7ec0878059ec..57f5febf871b 100644 --- a/example/poisson_gmg/parthinput.poisson +++ b/example/poisson_gmg/parthinput.poisson @@ -25,14 +25,14 @@ multigrid = true nx1 = 64 x1min = -1.0 x1max = 1.0 -ix1_bc = user -ox1_bc = user +ix1_bc = outflow +ox1_bc = outflow nx2 = 64 x2min = -1.0 x2max = 1.0 -ix2_bc = user -ox2_bc = user +ix2_bc = outflow +ox2_bc = outflow nx3 = 1 x3min = 0.0 diff --git a/example/poisson_gmg/poisson_driver.cpp b/example/poisson_gmg/poisson_driver.cpp index 42c43b7bde54..dfe177b26310 100644 --- a/example/poisson_gmg/poisson_driver.cpp +++ b/example/poisson_gmg/poisson_driver.cpp @@ -99,13 +99,12 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { auto zero_u = tl.AddTask(get_rhs, solvers::utils::SetToZero, md); auto solve = zero_u; - auto &itl = tl.AddIteration("Solver"); if (solver == "BiCGSTAB") { auto setup = bicgstab_solver->AddSetupTasks(region, tl, zero_u, i, reg_dep_id, pmesh); - solve = bicgstab_solver->AddTasks(tl, itl, setup, i, pmesh, region, reg_dep_id); + solve = bicgstab_solver->AddTasks(tl, setup, pmesh, i); } else if (solver == "MG") { auto setup = mg_solver->AddSetupTasks(region, tl, zero_u, i, reg_dep_id, pmesh); - solve = mg_solver->AddTasks(tl, itl, solve, i, pmesh, region, reg_dep_id); + solve = mg_solver->AddTasks(tl, setup, pmesh, i); } else { PARTHENON_FAIL("Unknown solver type."); } @@ -115,8 +114,7 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) { if (use_exact_rhs) { auto diff = tl.AddTask(solve, solvers::utils::AddFieldsAndStore, md, 1.0, -1.0); - auto get_err = - solvers::utils::DotProduct(diff, region, tl, i, reg_dep_id, &err, md); + auto get_err = solvers::utils::DotProduct(diff, tl, &err, md); tl.AddTask( get_err, [](PoissonDriver *driver, int partition) { diff --git a/example/sparse_advection/parthenon_app_inputs.cpp b/example/sparse_advection/parthenon_app_inputs.cpp index 1cd806accad6..0f9730d7f718 100644 --- a/example/sparse_advection/parthenon_app_inputs.cpp +++ b/example/sparse_advection/parthenon_app_inputs.cpp @@ -97,8 +97,8 @@ void ProblemGenerator(MeshBlock *pmb, ParameterInput *pin) { } pmb->par_for( - "SparseAdvection::ProblemGenerator", 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, - ib.s, ib.e, KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, v.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) { auto x = coords.Xc<1>(i) - x0; auto y = coords.Xc<2>(j) - y0; auto z = coords.Xc<3>(k); diff --git a/example/sparse_advection/sparse_advection_package.cpp b/example/sparse_advection/sparse_advection_package.cpp index 01ec9c62475b..0ffeea6ca96d 100644 --- a/example/sparse_advection/sparse_advection_package.cpp +++ b/example/sparse_advection/sparse_advection_package.cpp @@ -182,7 +182,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { TaskStatus CalculateFluxes(std::shared_ptr> &rc) { using parthenon::MetadataFlag; - Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pmb = rc->GetBlockPointer(); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); 
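[PARTHENON_INSTRUMENT, which replaces the paired pushRegion/popRegion calls throughout this series, is a scope guard: the region is popped on every exit path, so the early returns in the communication tasks below can no longer leak a profiling region. A stand-in illustration of the mechanism, assuming only that the real macro in utils/instrument.hpp is RAII-based; its label format may differ:

    #include <Kokkos_Core.hpp>
    #include <string>

    // Illustrative scope guard: push a named Kokkos profiling region on
    // entry, pop it automatically when the enclosing scope ends.
    struct ScopedKokkosRegion {
      explicit ScopedKokkosRegion(const std::string &name) {
        Kokkos::Profiling::pushRegion(name);
      }
      ~ScopedKokkosRegion() { Kokkos::Profiling::popRegion(); }
    };

    // Hypothetical macro in the spirit of PARTHENON_INSTRUMENT:
    #define DEMO_INSTRUMENT                                           \
      ScopedKokkosRegion demo_region_(std::string(__FILE__) + "::" +  \
                                      std::to_string(__LINE__) + "::" + __func__);
]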
@@ -203,8 +203,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { size_t scratch_size_in_bytes = parthenon::ScratchPad2D::shmem_size(nvar, nx1); // get x-fluxes pmb->par_for_outer( - "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { parthenon::ScratchPad2D ql(member.team_scratch(scratch_level), nvar, nx1); parthenon::ScratchPad2D qr(member.team_scratch(scratch_level), nvar, nx1); @@ -226,8 +226,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { // get y-fluxes if (pmb->pmy_mesh->ndim >= 2) { pmb->par_for_outer( - "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clearly inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse // reconstructed arrays for different `j` with `j` being part of the outer @@ -257,7 +257,6 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { PARTHENON_REQUIRE_THROWS(pmb->pmy_mesh->ndim == 2, "Sparse Advection example must be 2D"); - Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes return TaskStatus::complete; } diff --git a/example/stochastic_subgrid/stochastic_subgrid_package.cpp b/example/stochastic_subgrid/stochastic_subgrid_package.cpp index 6ab1aad2501c..f91bf0ad7b75 100644 --- a/example/stochastic_subgrid/stochastic_subgrid_package.cpp +++ b/example/stochastic_subgrid/stochastic_subgrid_package.cpp @@ -245,11 +245,9 @@ AmrTag CheckRefinement(MeshBlockData *rc) { // randomly sample an interation number for each cell from the discrete power-law // distribution TaskStatus ComputeNumIter(std::shared_ptr> &md, Packages_t &packages) { - Kokkos::Profiling::pushRegion("Task_ComputeNumIter"); + PARTHENON_INSTRUMENT - Kokkos::Profiling::pushRegion("Task_ComputeNumIter_pack"); auto pack = md->PackVariables(std::vector({"num_iter"})); - Kokkos::Profiling::popRegion(); auto pkg = packages.Get("stochastic_subgrid_package"); const auto &pool = @@ -264,9 +262,9 @@ TaskStatus ComputeNumIter(std::shared_ptr> &md, Packages_t &packa int N_min = pkg->Param("N_min"); par_for( - parthenon::loop_pattern_mdrange_tag, "ComputeNumIter", parthenon::DevExecSpace(), 0, - pack.GetDim(5) - 1, 0, pack.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, - KOKKOS_LAMBDA(int b, int v, int k, int j, int i) { + parthenon::loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, + parthenon::DevExecSpace(), 0, pack.GetDim(5) - 1, 0, pack.GetDim(4) - 1, kb.s, kb.e, + jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(int b, int v, int k, int j, int i) { auto rng = pool.get_state(); double rand1 = rng.drand(); double rand2 = rng.drand(); @@ -276,7 +274,6 @@ TaskStatus ComputeNumIter(std::shared_ptr> &md, Packages_t &packa pack(b, v, k, j, i) = num_iter; }); - Kokkos::Profiling::popRegion(); // Task_ComputeNumIter return TaskStatus::complete; } @@ -305,7 +302,7 @@ void DoLotsOfWork(MeshBlockData *rc) { const Real ilog10 = 1.0 / log(10.0); pmb->par_for( - "stochastic_subgrid_package::DoLotsOfWork", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, 
jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { int num_iter = v(niter, k, j, i); @@ -346,7 +343,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // this is obviously overkill for this constant velocity problem Real min_dt; pmb->par_reduce( - "stochastic_subgrid_package::EstimateTimestep", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i, Real &lmin_dt) { if (vx != 0.0) lmin_dt = std::min(lmin_dt, coords.Dxc(k, j, i) / std::abs(vx)); @@ -364,7 +361,7 @@ Real EstimateTimestepBlock(MeshBlockData *rc) { // some field "advected" that we are pushing around. // This routine implements all the "physics" in this example TaskStatus CalculateFluxes(std::shared_ptr> &rc) { - Kokkos::Profiling::pushRegion("Task_Advection_CalculateFluxes"); + PARTHENON_INSTRUMENT auto pmb = rc->GetBlockPointer(); const IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); const IndexRange jb = pmb->cellbounds.GetBoundsJ(IndexDomain::interior); @@ -383,8 +380,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { parthenon::ParArray4D x1flux = rc->Get("advected").flux[X1DIR].Get<4>(); // get x-fluxes pmb->par_for_outer( - "x1 flux", 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 2 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { parthenon::ScratchPad2D ql(member.team_scratch(scratch_level), nvar, nx1); parthenon::ScratchPad2D qr(member.team_scratch(scratch_level), nvar, nx1); // get reconstructed state on faces @@ -407,8 +404,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { if (pmb->pmy_mesh->ndim >= 2) { parthenon::ParArray4D x2flux = rc->Get("advected").flux[X2DIR].Get<4>(); pmb->par_for_outer( - "x2 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, jb.e + 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e, jb.s, + jb.e + 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. The key point is that we cannot reuse // reconstructed arrays for different `j` with `j` being part of the outer @@ -439,7 +436,8 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { if (pmb->pmy_mesh->ndim == 3) { parthenon::ParArray4D x3flux = rc->Get("advected").flux[X3DIR].Get<4>(); pmb->par_for_outer( - "x3 flux", 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, jb.s, jb.e, + PARTHENON_AUTO_LABEL, 3 * scratch_size_in_bytes, scratch_level, kb.s, kb.e + 1, + jb.s, jb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int k, const int j) { // the overall algorithm/use of scratch pad here is clear inefficient and kept // just for demonstrating purposes. 
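[These flux kernels all share one technique: each team allocates its reconstruction buffers (ql, qr) in fast scratch memory that is sized before launch, then sweeps the pencil with an inner team range. A plain-Kokkos sketch of the same pattern, reduced to 1D buffers for brevity; par_for_outer wraps this machinery, and the function and label names here are illustrative:

    #include <Kokkos_Core.hpp>

    using ScratchPad1D =
        Kokkos::View<double *, Kokkos::DefaultExecutionSpace::scratch_memory_space,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

    void FluxLikeKernel(const int nteams, const int nx1) {
      // Two per-team buffers (ql and qr), sized up front.
      const size_t scratch_bytes = 2 * ScratchPad1D::shmem_size(nx1);
      Kokkos::parallel_for(
          "flux_like_kernel",
          Kokkos::TeamPolicy<>(nteams, Kokkos::AUTO)
              .set_scratch_size(0, Kokkos::PerTeam(scratch_bytes)),
          KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type &member) {
            ScratchPad1D ql(member.team_scratch(0), nx1);
            ScratchPad1D qr(member.team_scratch(0), nx1);
            Kokkos::parallel_for(Kokkos::TeamThreadRange(member, nx1),
                                 [&](const int i) { ql(i) = i; qr(i) = 2.0 * ql(i); });
            member.team_barrier(); // reuse ql/qr only after all threads finish
          });
    }
]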
The key point is that we cannot reuse @@ -467,7 +465,6 @@ TaskStatus CalculateFluxes(std::shared_ptr> &rc) { }); } - Kokkos::Profiling::popRegion(); // Task_Advection_CalculateFluxes return TaskStatus::complete; } diff --git a/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py new file mode 100644 index 000000000000..e23a40bbf72b --- /dev/null +++ b/scripts/python/packages/parthenon_process_kernel_timer/process_timer.py @@ -0,0 +1,164 @@ +# ========================================================================================= +# (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +# +# This program was produced under U.S. Government contract 89233218CNA000001 for Los +# Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +# for the U.S. Department of Energy/National Nuclear Security Administration. All rights +# in the program are reserved by Triad National Security, LLC, and the U.S. Department +# of Energy/National Nuclear Security Administration. The Government is granted for +# itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +# license in this material to reproduce, prepare derivative works, distribute copies to +# the public, perform publicly and display publicly, and to permit others to do so. +# ========================================================================================= + +""" +Process the text profiling data spit out by the kokkos simple kernel timer, arranging things +in a convenient way, assuming all the region/kernel names are auto-generated by parthenon +""" + +import sys + + +class region: + def __init__(self, fl, line, data): + self.file = fl + self.line = line + self.data = data + + +class func: + def __init__(self, name): + self.name = name + self.file = "" + self.reg = {} + self.reg_type = {} + self.total_time = 0.0 + self.total_pct = 0.0 + self.total_kernel_time = 0.0 + + def add_region(self, fl, line, data): + if self.file == "": + self.file = fl + else: + if fl != self.file: + print("Duplicate function name found") + sys.exit(1) + dsplit = data.split() + self.reg_type[line] = dsplit[0][1:-1] + self.reg[line] = [float(dsplit[1]), float(dsplit[5].rstrip())] + if self.reg_type[line] != "REGION": + self.total_kernel_time += self.reg[line][0] + + def compute_total_cost(self): + if len(self.reg) == 1: + for key in self.reg.keys(): + if self.reg_type[key] != "REGION": + self.name += " ***WARNING: no function-level profiling***" + self.total_time = self.reg[key][0] + self.total_pct = self.reg[key][1] + return + # sort by line number + keys = list(self.reg.keys()) + key_int = [int(k) for k in keys] + key_int.sort() + keys = [str(k) for k in key_int] + if self.reg_type[keys[0]] == "REGION": + # assume this is the time for the function + self.total_time = self.reg[keys[0]][0] + self.total_pct = self.reg[keys[0]][1] + else: + # assume there are just kernel timers + self.name += " ***WARNING: no function-level profiling***" + wall = 0.0 + for key, val in self.reg.items(): + if self.reg_type[key] != "REGION": + self.total_time += self.reg[key][0] + wall = self.reg[key][0] / self.reg[key][1] + self.total_pct = self.total_time / wall + + def print_region(self): + print( + self.name, "time: " + str(self.total_time), "%wall: " + str(self.total_pct) + ) + for key, val in self.reg.items(): + if val[0] != self.total_time: + print( + " ", + "type: " + self.reg_type[key], + "line: " + key, 
+ "selftime: " + str(val[0]), + "%func: " + str(100 * val[0] / self.total_time), + ) + if self.total_time > 0: + print( + " ", + "Kernel summary: Total kernel time: " + str(self.total_kernel_time), + " % Time in kernels: ", + str(100 * self.total_kernel_time / self.total_time), + ) + else: + print(" ", "Apparently this function took zero time to execute???") + + +def parse_name(s): + if s[0:6] == "Kokkos": + f = s.rstrip() + line = "na" + fl = "na" + label = f + else: + words = s.split("::") + if words[0] == "kokkos_abstraction.hpp": + return "", "", "", "", True + # make sure it follows the filename::line_number::name convection + if ".cpp" in s or ".hpp" in s: + # now strip away the file and line + fl = s[: s.find(":")] + f = s[s.find(":") + 2 :] + line = f[: f.find(":")] + f = f[f.find(":") + 2 :] + label = (fl + "::" + f).rstrip() + else: + print("nonconforming entry", s) + sys.exit(1) + return f, line, fl, label, False + + +def main(prof): + funcs = {} + with open(prof) as fp: + raw = fp.readlines() + if raw[0].rstrip() != "Regions:": + print( + prof + + " does not appear to be a profile from the Kokkos simple kernel timer" + ) + print(raw[0]) + sys.exit(1) + cline = 2 + in_regions = True + # process regions/kernels + while raw[cline].rstrip() != "": + sraw = raw[cline][2::] + f, line, fl, label, skip = parse_name(sraw) + if skip: + cline += 2 + continue + if label not in funcs.keys(): + funcs[label] = func(label) + funcs[label].add_region(fl, line, raw[cline + 1].rstrip()) + cline += 2 + if raw[cline].rstrip() == "" and in_regions: + cline += 4 + in_regions = False + + for key in funcs.keys(): + funcs[key].compute_total_cost() + + for key in sorted(funcs, key=lambda name: funcs[name].total_time, reverse=True): + funcs[key].print_region() + print() + + +if __name__ == "__main__": + main(sys.argv[1]) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aaeabd8a7dd7..26f33f32b4ef 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -131,6 +131,7 @@ add_library(parthenon interface/mesh_data.hpp interface/meshblock_data.cpp interface/meshblock_data.hpp + interface/swarm_comms.cpp interface/swarm_container.cpp interface/swarm.cpp interface/swarm.hpp @@ -183,6 +184,10 @@ add_library(parthenon outputs/outputs.cpp outputs/outputs.hpp outputs/parthenon_hdf5.cpp + outputs/parthenon_hdf5_attributes.cpp + outputs/parthenon_hdf5_attributes_read.cpp + outputs/parthenon_hdf5_attributes_write.cpp + outputs/parthenon_hdf5_types.hpp outputs/parthenon_xdmf.cpp outputs/parthenon_hdf5.hpp outputs/parthenon_xdmf.hpp @@ -213,10 +218,8 @@ add_library(parthenon solvers/mg_solver.hpp solvers/solver_utils.hpp - tasks/task_id.cpp - tasks/task_id.hpp - tasks/task_list.hpp - tasks/task_types.hpp + tasks/tasks.hpp + tasks/thread_pool.hpp time_integration/butcher_integrator.cpp time_integration/low_storage_integrator.cpp @@ -233,10 +236,13 @@ add_library(parthenon utils/communication_buffer.hpp utils/cleantypes.hpp utils/concepts_lite.hpp - utils/error_checking.hpp utils/error_checking.cpp + utils/error_checking.hpp utils/hash.hpp + utils/index_split.cpp + utils/index_split.hpp utils/indexer.hpp + utils/instrument.hpp utils/loop_utils.hpp utils/morton_number.hpp utils/mpi_types.hpp @@ -263,6 +269,8 @@ add_library(parthenon parameter_input.cpp parameter_input.hpp parthenon_array_generic.hpp + parthenon_arrays.cpp + parthenon_arrays.hpp parthenon_manager.cpp parthenon_manager.hpp parthenon_mpi.hpp @@ -309,7 +317,7 @@ if (Kokkos_ENABLE_CUDA AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "Clang") 
target_compile_options(parthenon PUBLIC --expt-relaxed-constexpr) endif() -target_link_libraries(parthenon PUBLIC Kokkos::kokkos) +target_link_libraries(parthenon PUBLIC Kokkos::kokkos Threads::Threads) if (PARTHENON_ENABLE_ASCENT) if (ENABLE_MPI) diff --git a/src/amr_criteria/refinement_package.cpp b/src/amr_criteria/refinement_package.cpp index a1f4c4eb560b..459877767d1d 100644 --- a/src/amr_criteria/refinement_package.cpp +++ b/src/amr_criteria/refinement_package.cpp @@ -59,6 +59,7 @@ AmrTag CheckAllRefinement(MeshBlockData *rc) { // 2) the code must maintain proper nesting, which sometimes means a block that is // tagged as "derefine" must be left alone (or possibly refined?) because of // neighboring blocks. Similarly for "do nothing" + PARTHENON_INSTRUMENT MeshBlock *pmb = rc->GetBlockPointer(); // delta_level holds the max over all criteria. default to derefining. AmrTag delta_level = AmrTag::derefine; @@ -90,11 +91,12 @@ AmrTag CheckAllRefinement(MeshBlockData *rc) { AmrTag FirstDerivative(const AMRBounds &bnds, const ParArray3D &q, const Real refine_criteria, const Real derefine_criteria) { + PARTHENON_INSTRUMENT const int ndim = 1 + (bnds.je > bnds.js) + (bnds.ke > bnds.ks); Real maxd = 0.0; par_reduce( - loop_pattern_mdrange_tag, "refinement first derivative", DevExecSpace(), bnds.ks, - bnds.ke, bnds.js, bnds.je, bnds.is, bnds.ie, + loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), bnds.ks, bnds.ke, + bnds.js, bnds.je, bnds.is, bnds.ie, KOKKOS_LAMBDA(int k, int j, int i, Real &maxd) { Real scale = std::abs(q(k, j, i)); Real d = @@ -118,11 +120,12 @@ AmrTag FirstDerivative(const AMRBounds &bnds, const ParArray3D &q, AmrTag SecondDerivative(const AMRBounds &bnds, const ParArray3D &q, const Real refine_criteria, const Real derefine_criteria) { + PARTHENON_INSTRUMENT const int ndim = 1 + (bnds.je > bnds.js) + (bnds.ke > bnds.ks); Real maxd = 0.0; par_reduce( - loop_pattern_mdrange_tag, "refinement second derivative", DevExecSpace(), bnds.ks, - bnds.ke, bnds.js, bnds.je, bnds.is, bnds.ie, + loop_pattern_mdrange_tag, PARTHENON_AUTO_LABEL, DevExecSpace(), bnds.ks, bnds.ke, + bnds.js, bnds.je, bnds.is, bnds.ie, KOKKOS_LAMBDA(int k, int j, int i, Real &maxd) { Real aqt = std::abs(q(k, j, i)) + TINY_NUMBER; Real qavg = 0.5 * (q(k, j, i + 1) + q(k, j, i - 1)); @@ -153,19 +156,17 @@ void SetRefinement_(MeshBlockData *rc) { template <> TaskStatus Tag(MeshBlockData *rc) { - Kokkos::Profiling::pushRegion("Task_Tag_Block"); + PARTHENON_INSTRUMENT SetRefinement_(rc); - Kokkos::Profiling::popRegion(); // Task_Tag_Block return TaskStatus::complete; } template <> TaskStatus Tag(MeshData *rc) { - Kokkos::Profiling::pushRegion("Task_Tag_Mesh"); + PARTHENON_INSTRUMENT for (int i = 0; i < rc->NumBlocks(); i++) { SetRefinement_(rc->GetBlockData(i).get()); } - Kokkos::Profiling::popRegion(); // Task_Tag_Mesh return TaskStatus::complete; } diff --git a/src/application_input.hpp b/src/application_input.hpp index a9ec96c1551e..f66001ee472b 100644 --- a/src/application_input.hpp +++ b/src/application_input.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -36,12 +36,17 @@ struct ApplicationInput { std::function InitUserMeshData = nullptr; std::function *)> MeshProblemGenerator = nullptr; + std::function *)> MeshPostInitialization = + nullptr; std::function PreStepMeshUserWorkInLoop = nullptr; std::function PostStepMeshUserWorkInLoop = nullptr; + std::function + UserMeshWorkBeforeOutput = nullptr; + std::function PreStepDiagnosticsInLoop = nullptr; std::function @@ -57,6 +62,7 @@ struct ApplicationInput { InitApplicationMeshBlockData = nullptr; std::function InitMeshBlockUserData = nullptr; std::function ProblemGenerator = nullptr; + std::function PostInitialization = nullptr; std::function MeshBlockUserWorkBeforeOutput = nullptr; }; diff --git a/src/basic_types.hpp b/src/basic_types.hpp index 9898140f82b4..b36383aa2827 100644 --- a/src/basic_types.hpp +++ b/src/basic_types.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -45,11 +45,11 @@ using Real = double; // X3DIR z, phi, etc... enum CoordinateDirection { NODIR = -1, X0DIR = 0, X1DIR = 1, X2DIR = 2, X3DIR = 3 }; enum class BlockLocation { Left = 0, Center = 1, Right = 2 }; -enum class TaskStatus { fail, complete, incomplete, iterate, skip, waiting }; +enum class TaskStatus { complete, incomplete, iterate }; enum class AmrTag : int { derefine = -1, same = 0, refine = 1 }; enum class RefinementOp_t { Prolongation, Restriction, None }; - +enum class CellLevel : int { coarse = -1, same = 0, fine = 1 }; // JMM: Not clear this is the best place for this but it minimizes // circular dependency nonsense. 
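[Trimming TaskStatus to complete, incomplete, and iterate means a user task now reports exactly one of: done, not ready yet (retry this cycle, as the receive tasks below do), or keep running the enclosing iteration. The CheckConvergence task earlier in this patch is the canonical shape; a minimal sketch:

    // Iterative task under the reduced enum: signal convergence with
    // complete, otherwise ask the enclosing iteration to run again.
    parthenon::TaskStatus CheckResidual(parthenon::Real err, parthenon::Real tol) {
      return err < tol ? parthenon::TaskStatus::complete
                       : parthenon::TaskStatus::iterate;
    }
]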
constexpr int NUM_BNDRY_TYPES = 10; @@ -152,16 +152,17 @@ inline std::vector GetTopologicalElements(TopologicalType tt if (tt == TT::Face) return {TE::F1, TE::F2, TE::F3}; return {TE::CC}; } + using TE = TopologicalElement; // Returns one if the I coordinate of el is offset from the zone center coordinates, // and zero otherwise -KOKKOS_INLINE_FUNCTION int TopologicalOffsetI(TE el) noexcept { +inline constexpr int TopologicalOffsetI(TE el) { return (el == TE::F1 || el == TE::E2 || el == TE::E3 || el == TE::NN); } -KOKKOS_INLINE_FUNCTION int TopologicalOffsetJ(TE el) noexcept { +inline constexpr int TopologicalOffsetJ(TE el) { return (el == TE::F2 || el == TE::E3 || el == TE::E1 || el == TE::NN); } -KOKKOS_INLINE_FUNCTION int TopologicalOffsetK(TE el) noexcept { +inline constexpr int TopologicalOffsetK(TE el) { return (el == TE::F3 || el == TE::E2 || el == TE::E1 || el == TE::NN); } diff --git a/src/bvals/boundary_conditions.cpp b/src/bvals/boundary_conditions.cpp index be1924ab3e3e..2bc5eb68a23e 100644 --- a/src/bvals/boundary_conditions.cpp +++ b/src/bvals/boundary_conditions.cpp @@ -33,7 +33,7 @@ bool DoPhysicalBoundary_(const BoundaryFlag flag, const BoundaryFace face, TaskStatus ApplyBoundaryConditionsOnCoarseOrFine(std::shared_ptr> &rc, bool coarse) { - Kokkos::Profiling::pushRegion("Task_ApplyBoundaryConditionsOnCoarseOrFine"); + PARTHENON_INSTRUMENT using namespace boundary_cond_impl; MeshBlock *pmb = rc->GetBlockPointer(); Mesh *pmesh = pmb->pmy_mesh; @@ -50,7 +50,6 @@ TaskStatus ApplyBoundaryConditionsOnCoarseOrFine(std::shared_ptr> &rc, bool coarse, const int offsetin = INNER; const int offsetout = !INNER; pmb->par_for_bndry( - label, nb, domain, el, coarse, + PARTHENON_AUTO_LABEL, nb, domain, el, coarse, KOKKOS_LAMBDA(const int &l, const int &k, const int &j, const int &i) { if (TYPE == BCType::Reflect) { const bool reflect = (q(b, el, l).vector_component == DIR); diff --git a/src/bvals/bvals_base.cpp b/src/bvals/bvals_base.cpp index 7c43c4917197..de8e74519234 100644 --- a/src/bvals/bvals_base.cpp +++ b/src/bvals/bvals_base.cpp @@ -337,7 +337,7 @@ int BoundaryBase::CreateBvalsMPITag(int lid, int bufid) { void BoundaryBase::SearchAndSetNeighbors( Mesh *mesh, MeshBlockTree &tree, int *ranklist, int *nslist, const std::unordered_set &newly_refined) { - Kokkos::Profiling::pushRegion("SearchAndSetNeighbors"); + PARTHENON_INSTRUMENT MeshBlockTree *neibt; int myox1, myox2 = 0, myox3 = 0, myfx1, myfx2, myfx3; myfx1 = ((loc.lx1() & 1LL) == 1LL); @@ -404,7 +404,6 @@ void BoundaryBase::SearchAndSetNeighbors( } if (block_size_.nx(X2DIR) == 1) { SetNeighborOwnership(newly_refined); - Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors return; } @@ -539,7 +538,6 @@ void BoundaryBase::SearchAndSetNeighbors( if (block_size_.nx(X3DIR) == 1) { SetNeighborOwnership(newly_refined); - Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors return; } @@ -662,7 +660,6 @@ void BoundaryBase::SearchAndSetNeighbors( } SetNeighborOwnership(newly_refined); - Kokkos::Profiling::popRegion(); // SearchAndSetNeighbors } void BoundaryBase::SetNeighborOwnership( diff --git a/src/bvals/comms/bnd_info.cpp b/src/bvals/comms/bnd_info.cpp index 3eaeb16091e9..e2013354a398 100644 --- a/src/bvals/comms/bnd_info.cpp +++ b/src/bvals/comms/bnd_info.cpp @@ -226,6 +226,7 @@ BndInfo BndInfo::GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb, BndInfo out; out.allocated = v->IsAllocated(); + out.alloc_status = v->GetAllocationStatus(); if (!out.allocated) return out; out.buf = buf->buffer(); @@ -233,7 +234,6 @@ 
BndInfo BndInfo::GetSendBndInfo(MeshBlock *pmb, const NeighborBlock &nb, int Nv = v->GetDim(4); int Nu = v->GetDim(5); int Nt = v->GetDim(6); - int mylevel = pmb->loc.level(); auto elements = v->GetTopologicalElements(); @@ -267,6 +267,7 @@ BndInfo BndInfo::GetSetBndInfo(MeshBlock *pmb, const NeighborBlock &nb, PARTHENON_FAIL("Buffer should be in a received state."); } out.allocated = v->IsAllocated(); + out.alloc_status = v->GetAllocationStatus(); int Nv = v->GetDim(4); int Nu = v->GetDim(5); @@ -297,6 +298,7 @@ ProResInfo ProResInfo::GetInteriorRestrict(MeshBlock *pmb, const NeighborBlock & ProResInfo out; out.allocated = v->IsAllocated(); + out.alloc_status = v->GetAllocationStatus(); if (!out.allocated) return out; int Nv = v->GetDim(4); @@ -328,6 +330,7 @@ ProResInfo ProResInfo::GetInteriorProlongate(MeshBlock *pmb, const NeighborBlock ProResInfo out; out.allocated = v->IsAllocated(); + out.alloc_status = v->GetAllocationStatus(); if (!out.allocated) return out; int Nv = v->GetDim(4); @@ -358,6 +361,7 @@ ProResInfo ProResInfo::GetSend(MeshBlock *pmb, const NeighborBlock &nb, ProResInfo out; out.allocated = v->IsAllocated(); + out.alloc_status = v->GetAllocationStatus(); if (!out.allocated) return out; int Nv = v->GetDim(4); @@ -388,6 +392,7 @@ ProResInfo ProResInfo::GetSet(MeshBlock *pmb, const NeighborBlock &nb, std::shared_ptr> v) { ProResInfo out; out.allocated = v->IsAllocated(); + out.alloc_status = v->GetAllocationStatus(); int Nv = v->GetDim(4); int Nu = v->GetDim(5); int Nt = v->GetDim(6); @@ -448,6 +453,7 @@ BndInfo BndInfo::GetSendCCFluxCor(MeshBlock *pmb, const NeighborBlock &nb, CommBuffer::owner_t> *buf) { BndInfo out; out.allocated = v->IsAllocated(); + out.alloc_status = v->GetAllocationStatus(); if (!v->IsAllocated()) { // Not going to actually do anything with this buffer return out; @@ -507,9 +513,11 @@ BndInfo BndInfo::GetSetCCFluxCor(MeshBlock *pmb, const NeighborBlock &nb, if (!v->IsAllocated() || buf->GetState() != BufferState::received) { out.allocated = false; + out.alloc_status = v->GetAllocationStatus(); return out; } out.allocated = true; + out.alloc_status = v->GetAllocationStatus(); out.buf = buf->buffer(); IndexRange ib = pmb->cellbounds.GetBoundsI(IndexDomain::interior); diff --git a/src/bvals/comms/bnd_info.hpp b/src/bvals/comms/bnd_info.hpp index 4cc03b0c13a9..e7a265272c86 100644 --- a/src/bvals/comms/bnd_info.hpp +++ b/src/bvals/comms/bnd_info.hpp @@ -47,6 +47,7 @@ struct BndInfo { CoordinateDirection dir; bool allocated = true; bool buf_allocated = true; + int alloc_status; buf_pool_t::weak_t buf; // comm buffer from pool ParArrayND var; // data variable used for comms @@ -79,6 +80,7 @@ struct ProResInfo { CoordinateDirection dir; bool allocated = true; + int alloc_status; RefinementOp_t refinement_op = RefinementOp_t::None; Coordinates_t coords, coarse_coords; // coords diff --git a/src/bvals/comms/boundary_communication.cpp b/src/bvals/comms/boundary_communication.cpp index 8e97e9915c74..cc093a34f080 100644 --- a/src/bvals/comms/boundary_communication.cpp +++ b/src/bvals/comms/boundary_communication.cpp @@ -34,8 +34,8 @@ #include "mesh/mesh_refinement.hpp" #include "mesh/meshblock.hpp" #include "prolong_restrict/prolong_restrict.hpp" -#include "tasks/task_id.hpp" -#include "tasks/task_list.hpp" + +#include "tasks/tasks.hpp" #include "utils/error_checking.hpp" #include "utils/loop_utils.hpp" @@ -46,7 +46,7 @@ using namespace loops::shorthands; template TaskStatus SendBoundBufs(std::shared_ptr> &md) { - 
Kokkos::Profiling::pushRegion("Task_LoadAndSendBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, true); @@ -59,11 +59,9 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { CheckSendBufferCacheForRebuild(md); if (nbound == 0) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs return TaskStatus::complete; } if (other_communication_unfinished) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs return TaskStatus::incomplete; } @@ -92,7 +90,7 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { auto &sending_nonzero_flags_h = cache.sending_non_zero_flags_h; Kokkos::parallel_for( - "SendBoundBufs", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -152,7 +150,6 @@ TaskStatus SendBoundBufs(std::shared_ptr> &md) { buf.SendNull(); } - Kokkos::Profiling::popRegion(); // Task_LoadAndSendBoundBufs return TaskStatus::complete; } @@ -167,7 +164,7 @@ SendBoundBufs(std::shared_ptr> template TaskStatus StartReceiveBoundBufs(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_StartReceiveBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); if (cache.buf_vec.size() == 0) @@ -177,7 +174,6 @@ TaskStatus StartReceiveBoundBufs(std::shared_ptr> &md) { std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->TryStartReceive(); }); - Kokkos::Profiling::popRegion(); // Task_StartReceiveBoundBufs return TaskStatus::complete; } @@ -194,7 +190,7 @@ template TaskStatus StartReceiveBoundBufs( template TaskStatus ReceiveBoundBufs(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_ReceiveBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -225,7 +221,6 @@ TaskStatus ReceiveBoundBufs(std::shared_ptr> &md) { ++ibound; }); } - Kokkos::Profiling::popRegion(); // Task_ReceiveBoundBufs if (all_received) return TaskStatus::complete; return TaskStatus::incomplete; } @@ -243,7 +238,7 @@ ReceiveBoundBufs(std::shared_ptr TaskStatus SetBounds(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_SetInternalBoundaries"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -265,7 +260,7 @@ TaskStatus SetBounds(std::shared_ptr> &md) { // const Real threshold = Globals::sparse_config.allocation_threshold; auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( - "SetBoundaryBuffers", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -323,7 +318,6 @@ TaskStatus SetBounds(std::shared_ptr> &md) { refinement::Restrict(resolved_packages, cache.prores_cache, pmb->cellbounds, pmb->c_cellbounds); } - Kokkos::Profiling::popRegion(); // Task_SetInternalBoundaries return TaskStatus::complete; } @@ -337,7 +331,7 @@ SetBounds(std::shared_ptr> &); template TaskStatus ProlongateBounds(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_ProlongateBoundaries"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(bound_type, false); @@ -367,7 +361,6 @@ TaskStatus ProlongateBounds(std::shared_ptr> &md) { 
refinement::ProlongateInternal(resolved_packages, cache.prores_cache, pmb->cellbounds, pmb->c_cellbounds); } - Kokkos::Profiling::popRegion(); // Task_ProlongateBoundaries return TaskStatus::complete; } @@ -381,8 +374,8 @@ template TaskStatus ProlongateBounds(std::shared_ptr> &); // Adds all relevant boundary communication to a single task list -template -TaskID AddBoundaryExchangeTasks(TaskID dependency, TL_t &tl, +template +TaskID AddBoundaryExchangeTasks(TaskID dependency, TaskList &tl, std::shared_ptr> &md, bool multilevel) { // TODO(LFR): Splitting up the boundary tasks while doing prolongation can cause some // possible issues for sparse fields. In particular, the order in which @@ -422,13 +415,11 @@ TaskID AddBoundaryExchangeTasks(TaskID dependency, TL_t &tl, return fbound; } -template TaskID AddBoundaryExchangeTasks( - TaskID, TaskList &, std::shared_ptr> &, bool); -template TaskID AddBoundaryExchangeTasks( - TaskID, IterativeTasks &, std::shared_ptr> &, bool); - -template TaskID AddBoundaryExchangeTasks( - TaskID, TaskList &, std::shared_ptr> &, bool); -template TaskID AddBoundaryExchangeTasks( - TaskID, IterativeTasks &, std::shared_ptr> &, bool); +template TaskID +AddBoundaryExchangeTasks(TaskID, TaskList &, + std::shared_ptr> &, bool); + +template TaskID +AddBoundaryExchangeTasks(TaskID, TaskList &, + std::shared_ptr> &, bool); } // namespace parthenon diff --git a/src/bvals/comms/build_boundary_buffers.cpp b/src/bvals/comms/build_boundary_buffers.cpp index 9058877cac23..389ace24a132 100644 --- a/src/bvals/comms/build_boundary_buffers.cpp +++ b/src/bvals/comms/build_boundary_buffers.cpp @@ -113,7 +113,7 @@ void BuildBoundaryBufferSubset(std::shared_ptr> &md, // pmesh->boundary_comm_map.clear() after every remesh // in InitializeBlockTimeStepsAndBoundaries() TaskStatus BuildBoundaryBuffers(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_BuildSendBoundBufs"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &all_caches = md->GetBvarsCache(); @@ -127,7 +127,6 @@ TaskStatus BuildBoundaryBuffers(std::shared_ptr> &md) { BuildBoundaryBufferSubset(md, pmesh->boundary_comm_flxcor_map); - Kokkos::Profiling::popRegion(); // "Task_BuildSendBoundBufs" return TaskStatus::complete; } diff --git a/src/bvals/comms/bvals_in_one.hpp b/src/bvals/comms/bvals_in_one.hpp index dbc48efd821b..22b637a48a74 100644 --- a/src/bvals/comms/bvals_in_one.hpp +++ b/src/bvals/comms/bvals_in_one.hpp @@ -25,8 +25,8 @@ #include "basic_types.hpp" #include "bvals/bvals_interfaces.hpp" #include "coordinates/coordinates.hpp" -#include "tasks/task_id.hpp" -#include "tasks/task_list.hpp" + +#include "tasks/tasks.hpp" #include "utils/object_pool.hpp" namespace parthenon { @@ -72,8 +72,8 @@ TaskStatus ReceiveFluxCorrections(std::shared_ptr> &md); TaskStatus SetFluxCorrections(std::shared_ptr> &md); // Adds all relevant boundary communication to a single task list -template -TaskID AddBoundaryExchangeTasks(TaskID dependency, TL_t &tl, +template +TaskID AddBoundaryExchangeTasks(TaskID dependency, TaskList &tl, std::shared_ptr> &md, bool multilevel); // These tasks should not be called in down stream code diff --git a/src/bvals/comms/bvals_utils.hpp b/src/bvals/comms/bvals_utils.hpp index 8683027e1c4f..bdb5e95dc4c2 100644 --- a/src/bvals/comms/bvals_utils.hpp +++ b/src/bvals/comms/bvals_utils.hpp @@ -139,7 +139,7 @@ inline auto CheckSendBufferCacheForRebuild(std::shared_ptr> md) { } if (ibuf < cache.bnd_info_h.size()) { - if (cache.bnd_info_h(ibuf).allocated != v->IsAllocated()) rebuild = true; + 
if (cache.bnd_info_h(ibuf).alloc_status != v->GetAllocationStatus()) rebuild = true; rebuild = rebuild || !UsingSameResource(cache.bnd_info_h(ibuf).buf, buf.buffer()); } else { rebuild = true; @@ -162,7 +162,7 @@ inline auto CheckReceiveBufferCacheForRebuild(std::shared_ptr> md const std::size_t ibuf = cache.idx_vec[nbound]; auto &buf = *cache.buf_vec[ibuf]; if (ibuf < cache.bnd_info_h.size()) { - if (cache.bnd_info_h(ibuf).allocated != v->IsAllocated()) rebuild = true; + if (cache.bnd_info_h(ibuf).alloc_status != v->GetAllocationStatus()) rebuild = true; rebuild = rebuild || !UsingSameResource(cache.bnd_info_h(ibuf).buf, buf.buffer()); if ((buf.GetState() == BufferState::received) && diff --git a/src/bvals/comms/flux_correction.cpp b/src/bvals/comms/flux_correction.cpp index 5a82a15429e6..ff76bcba0014 100644 --- a/src/bvals/comms/flux_correction.cpp +++ b/src/bvals/comms/flux_correction.cpp @@ -39,7 +39,7 @@ namespace parthenon { using namespace impl; TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_LoadAndSendFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_send, true); @@ -53,12 +53,10 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { CheckSendBufferCacheForRebuild(md); if (nbound == 0) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections return TaskStatus::complete; } if (other_communication_unfinished) { - Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections return TaskStatus::incomplete; } @@ -69,7 +67,7 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { auto &bnd_info = cache.bnd_info; PARTHENON_REQUIRE(bnd_info.size() == nbound, "Need same size for boundary info"); Kokkos::parallel_for( - "SendFluxCorrectionBufs", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { auto &binfo = bnd_info(team_member.league_rank()); @@ -116,12 +114,11 @@ TaskStatus LoadAndSendFluxCorrections(std::shared_ptr> &md) { // Calling Send will send null if the underlying buffer is unallocated for (auto &buf : cache.buf_vec) buf->Send(); - Kokkos::Profiling::popRegion(); // Task_LoadAndSendFluxCorrections return TaskStatus::complete; } TaskStatus StartReceiveFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_StartReceiveFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); if (cache.buf_vec.size() == 0) @@ -131,12 +128,11 @@ TaskStatus StartReceiveFluxCorrections(std::shared_ptr> &md) { std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->TryStartReceive(); }); - Kokkos::Profiling::popRegion(); // Task_StartReceiveFluxCorrections return TaskStatus::complete; } TaskStatus ReceiveFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_ReceiveFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); @@ -149,14 +145,12 @@ TaskStatus ReceiveFluxCorrections(std::shared_ptr> &md) { std::begin(cache.buf_vec), std::end(cache.buf_vec), [&all_received](auto pbuf) { all_received = pbuf->TryReceive() && all_received; }); - Kokkos::Profiling::popRegion(); // Task_ReceiveFluxCorrections - if (all_received) return TaskStatus::complete; return 
TaskStatus::incomplete; } TaskStatus SetFluxCorrections(std::shared_ptr> &md) { - Kokkos::Profiling::pushRegion("Task_SetFluxCorrections"); + PARTHENON_INSTRUMENT Mesh *pmesh = md->GetMeshPointer(); auto &cache = md->GetBvarsCache().GetSubCache(BoundaryType::flxcor_recv, false); @@ -169,7 +163,7 @@ TaskStatus SetFluxCorrections(std::shared_ptr> &md) { auto &bnd_info = cache.bnd_info; Kokkos::parallel_for( - "SetFluxCorBuffers", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), nbound, Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -188,7 +182,6 @@ TaskStatus SetFluxCorrections(std::shared_ptr> &md) { std::for_each(std::begin(cache.buf_vec), std::end(cache.buf_vec), [](auto pbuf) { pbuf->Stale(); }); - Kokkos::Profiling::popRegion(); // Task_SetFluxCorrections return TaskStatus::complete; } diff --git a/src/bvals/comms/tag_map.cpp b/src/bvals/comms/tag_map.cpp index da562c29a2f9..0136f35e9a0e 100644 --- a/src/bvals/comms/tag_map.cpp +++ b/src/bvals/comms/tag_map.cpp @@ -57,13 +57,23 @@ template void TagMap::AddMeshDataToMap( std::shared_ptr> &md); void TagMap::ResolveMap() { +#ifdef MPI_PARALLEL + int flag; + void *max_tag; // largest supported MPI tag value + PARTHENON_MPI_CHECK(MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &max_tag, &flag)); + if (!flag) { + PARTHENON_FAIL("MPI error, cannot query largest supported MPI tag value."); + } +#endif for (auto it = map_.begin(); it != map_.end(); ++it) { auto &pair_map = it->second; int idx = 0; std::for_each(pair_map.begin(), pair_map.end(), [&idx](auto &pair) { pair.second = idx++; }); - if (idx > 32767) - PARTHENON_FAIL("Number of tags exceeds the maximum allowed by the MPI standard."); +#ifdef MPI_PARALLEL + if (idx > (*reinterpret_cast(max_tag)) && it->first != Globals::my_rank) + PARTHENON_FAIL("Number of tags exceeds the maximum allowed by this MPI version."); +#endif } } diff --git a/src/coordinates/uniform_cartesian.hpp b/src/coordinates/uniform_cartesian.hpp index b10cbe0ad06c..51b06467fae3 100644 --- a/src/coordinates/uniform_cartesian.hpp +++ b/src/coordinates/uniform_cartesian.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2023-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
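[ResolveMap now asks the MPI implementation for its actual tag limit instead of hard-coding the 32767 minimum the standard guarantees, and it skips the check for same-rank maps, which never turn into MPI messages. A standalone illustration of the query; note that the attribute comes back as a pointer to the integer value, which is why the patch casts through void*:

    #include <cstdio>
    #include <mpi.h>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int flag = 0;
      int *tag_ub = nullptr; // storage owned by the MPI library
      MPI_Comm_get_attr(MPI_COMM_WORLD, MPI_TAG_UB, &tag_ub, &flag);
      if (flag != 0) std::printf("MPI_TAG_UB = %d\n", *tag_ub);
      MPI_Finalize();
      return 0;
    }
]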
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -164,18 +164,11 @@ class UniformCartesian { template KOKKOS_FORCEINLINE_FUNCTION Real X(const int idx) const { - using TE = TopologicalElement; - [[maybe_unused]] bool constexpr X1EDGE = - el == TE::F1 || el == TE::E2 || el == TE::E3 || el == TE::NN; - [[maybe_unused]] bool constexpr X2EDGE = - el == TE::F2 || el == TE::E3 || el == TE::E1 || el == TE::NN; - [[maybe_unused]] bool constexpr X3EDGE = - el == TE::F3 || el == TE::E1 || el == TE::E2 || el == TE::NN; - if constexpr (dir == X1DIR && X1EDGE) { + if constexpr (dir == X1DIR && TopologicalOffsetI(el)) { return xmin_[dir - 1] + idx * dx_[dir - 1]; // idx - 1/2 - } else if constexpr (dir == X2DIR && X2EDGE) { + } else if constexpr (dir == X2DIR && TopologicalOffsetJ(el)) { return xmin_[dir - 1] + idx * dx_[dir - 1]; // idx - 1/2 - } else if constexpr (dir == X3DIR && X3EDGE) { + } else if constexpr (dir == X3DIR && TopologicalOffsetK(el)) { return xmin_[dir - 1] + idx * dx_[dir - 1]; // idx - 1/2 } else { return xmin_[dir - 1] + (idx + 0.5) * dx_[dir - 1]; // idx diff --git a/src/defs.hpp b/src/defs.hpp index ace72628f035..47e8c28b6b78 100644 --- a/src/defs.hpp +++ b/src/defs.hpp @@ -107,7 +107,7 @@ struct RegionSize { // TODO(felker): C++ Core Guidelines Enum.5: Don’t use ALL_CAPS for enumerators // (avoid clashes with preprocessor macros). Enumerated type definitions in this file and: // io_wrapper.hpp, bvals.hpp, field_diffusion.hpp, -// task_list.hpp, ??? +// tasks.hpp, ??? // identifiers for all 6 faces of a MeshBlock constexpr int BOUNDARY_NFACES = 6; diff --git a/src/driver/driver.cpp b/src/driver/driver.cpp index 1af398ee5663..f384a5503eda 100644 --- a/src/driver/driver.cpp +++ b/src/driver/driver.cpp @@ -67,16 +67,17 @@ DriverStatus EvolutionDriver::Execute() { SetGlobalTimeStep(); // Before loop do work - // App input version - Kokkos::Profiling::pushRegion("Driver_UserWorkBeforeLoop"); - if (app_input->UserWorkBeforeLoop != nullptr) { - app_input->UserWorkBeforeLoop(pmesh, pinput, tm); - } - // packages version - for (auto &[name, pkg] : pmesh->packages.AllPackages()) { - pkg->UserWorkBeforeLoop(pmesh, pinput, tm); - } - Kokkos::Profiling::popRegion(); // Driver_UserWorkBeforeLoop + { // UserWorkBeforeLoop + PARTHENON_INSTRUMENT + // App input version + if (app_input->UserWorkBeforeLoop != nullptr) { + app_input->UserWorkBeforeLoop(pmesh, pinput, tm); + } + // packages version + for (auto &[name, pkg] : pmesh->packages.AllPackages()) { + pkg->UserWorkBeforeLoop(pmesh, pinput, tm); + } + } // UserWorkBeforeLoop OutputSignal signal = OutputSignal::none; pouts->MakeOutputs(pmesh, pinput, &tm, signal); @@ -88,51 +89,54 @@ DriverStatus EvolutionDriver::Execute() { // Defaults must be set across all ranks DumpInputParameters(); - Kokkos::Profiling::pushRegion("Driver_Main"); - while (tm.KeepGoing()) { - if (Globals::my_rank == 0) OutputCycleDiagnostics(); + { // Main t < tmax loop region + PARTHENON_INSTRUMENT + while (tm.KeepGoing()) { + if (Globals::my_rank == 0) OutputCycleDiagnostics(); - pmesh->PreStepUserWorkInLoop(pmesh, pinput, tm); - pmesh->PreStepUserDiagnosticsInLoop(pmesh, pinput, tm); + pmesh->PreStepUserWorkInLoop(pmesh, pinput, tm); + pmesh->PreStepUserDiagnosticsInLoop(pmesh, pinput, tm); - TaskListStatus status = Step(); - if (status != TaskListStatus::complete) { - std::cerr << "Step failed to complete all tasks." 
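[The rewritten X<dir, el> above shows what the constexpr TopologicalOffset helpers buy: the per-element staggering is decided in one place rather than re-deriving the X1EDGE/X2EDGE/X3EDGE booleans locally. A standalone sketch of the logic, with the element enum reduced to what the example needs:

    // Elements offset along a direction sit on cell boundaries; everything
    // else is cell-centered (shifted by half a cell).
    enum class TE { CC, F1, F2, F3, E1, E2, E3, NN };

    constexpr bool OffsetI(TE el) {
      return el == TE::F1 || el == TE::E2 || el == TE::E3 || el == TE::NN;
    }

    template <TE el>
    constexpr double X1(double xmin, double dx, int idx) {
      return OffsetI(el) ? xmin + idx * dx          // on the cell boundary
                         : xmin + (idx + 0.5) * dx; // at the cell center
    }

    static_assert(X1<TE::F1>(0.0, 1.0, 2) == 2.0, "face lies on the boundary");
    static_assert(X1<TE::CC>(0.0, 1.0, 2) == 2.5, "center shifted by dx/2");
]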
<< std::endl; - return DriverStatus::failed; - } + TaskListStatus status = Step(); + if (status != TaskListStatus::complete) { + std::cerr << "Step failed to complete all tasks." << std::endl; + return DriverStatus::failed; + } - pmesh->PostStepUserWorkInLoop(pmesh, pinput, tm); - pmesh->PostStepUserDiagnosticsInLoop(pmesh, pinput, tm); + pmesh->PostStepUserWorkInLoop(pmesh, pinput, tm); + pmesh->PostStepUserDiagnosticsInLoop(pmesh, pinput, tm); - tm.ncycle++; - tm.time += tm.dt; - pmesh->mbcnt += pmesh->nbtotal; - pmesh->step_since_lb++; + tm.ncycle++; + tm.time += tm.dt; + pmesh->mbcnt += pmesh->nbtotal; + pmesh->step_since_lb++; - timer_LBandAMR.reset(); - pmesh->LoadBalancingAndAdaptiveMeshRefinement(pinput, app_input); - if (pmesh->modified) InitializeBlockTimeStepsAndBoundaries(); - time_LBandAMR += timer_LBandAMR.seconds(); - SetGlobalTimeStep(); + timer_LBandAMR.reset(); + pmesh->LoadBalancingAndAdaptiveMeshRefinement(pinput, app_input); + if (pmesh->modified) InitializeBlockTimeStepsAndBoundaries(); + time_LBandAMR += timer_LBandAMR.seconds(); + SetGlobalTimeStep(); - // check for signals - signal = SignalHandler::CheckSignalFlags(); + // check for signals + signal = SignalHandler::CheckSignalFlags(); - if (signal == OutputSignal::final) { - break; - } + if (signal == OutputSignal::final) { + break; + } - // skip the final (last) output at the end of the simulation time as it happens later - if (tm.KeepGoing()) { - pouts->MakeOutputs(pmesh, pinput, &tm, signal); - } + // skip the final (last) output at the end of the simulation time as it happens + // later + if (tm.KeepGoing()) { + pouts->MakeOutputs(pmesh, pinput, &tm, signal); + } - if (tm.ncycle == perf_cycle_offset) { - pmesh->mbcnt = 0; - timer_main.reset(); - } - } // END OF MAIN INTEGRATION LOOP ====================================================== - Kokkos::Profiling::popRegion(); // Driver_Main + if (tm.ncycle == perf_cycle_offset) { + pmesh->mbcnt = 0; + timer_main.reset(); + } + } // END OF MAIN INTEGRATION LOOP + // ====================================================== + } // Main t < tmax loop region pmesh->UserWorkAfterLoop(pmesh, pinput, tm); diff --git a/src/driver/driver.hpp b/src/driver/driver.hpp index 96937177312b..929ea19c3f2b 100644 --- a/src/driver/driver.hpp +++ b/src/driver/driver.hpp @@ -27,7 +27,7 @@ #include "mesh/mesh.hpp" #include "outputs/outputs.hpp" #include "parameter_input.hpp" -#include "tasks/task_list.hpp" +#include "tasks/tasks.hpp" namespace parthenon { diff --git a/src/driver/multistage.hpp b/src/driver/multistage.hpp index 07748ae3564c..d23c894be592 100644 --- a/src/driver/multistage.hpp +++ b/src/driver/multistage.hpp @@ -22,7 +22,7 @@ #include "driver/driver.hpp" #include "mesh/mesh.hpp" #include "parameter_input.hpp" -#include "tasks/task_list.hpp" +#include "tasks/tasks.hpp" #include "time_integration/staged_integrator.hpp" namespace parthenon { @@ -37,7 +37,7 @@ class MultiStageDriverGeneric : public EvolutionDriver { // the dependencies that must be executed. 
virtual TaskCollection MakeTaskCollection(BlockList_t &blocks, int stage) = 0; virtual TaskListStatus Step() { - Kokkos::Profiling::pushRegion("MultiStage_Step"); + PARTHENON_INSTRUMENT using DriverUtils::ConstructAndExecuteTaskLists; TaskListStatus status; integrator->dt = tm.dt; @@ -49,7 +49,6 @@ class MultiStageDriverGeneric : public EvolutionDriver { status = ConstructAndExecuteTaskLists<>(this, stage); if (status != TaskListStatus::complete) break; } - Kokkos::Profiling::popRegion(); // MultiStage_Step return status; } @@ -66,7 +65,7 @@ class MultiStageBlockTaskDriverGeneric : public MultiStageDriverGeneric(pin, app_in, pm) {} virtual TaskList MakeTaskList(MeshBlock *pmb, int stage) = 0; virtual TaskListStatus Step() { - Kokkos::Profiling::pushRegion("MultiStageBlockTask_Step"); + PARTHENON_INSTRUMENT using DriverUtils::ConstructAndExecuteBlockTasks; TaskListStatus status; Integrator *integrator = (this->integrator).get(); @@ -76,7 +75,6 @@ class MultiStageBlockTaskDriverGeneric : public MultiStageDriverGeneric(this, stage); if (status != TaskListStatus::complete) break; } - Kokkos::Profiling::popRegion(); // MultiStageBlockTask_Step return status; } diff --git a/src/interface/mesh_data.cpp b/src/interface/mesh_data.cpp index bb1a532ef6c8..a6bc08e02769 100644 --- a/src/interface/mesh_data.cpp +++ b/src/interface/mesh_data.cpp @@ -41,6 +41,26 @@ void MeshData::Initialize(const MeshData *src, } } +template +void MeshData::Set(BlockList_t blocks, Mesh *pmesh, int ndim) { + const int nblocks = blocks.size(); + ndim_ = ndim; + block_data_.resize(nblocks); + SetMeshPointer(pmesh); + for (int i = 0; i < nblocks; i++) { + block_data_[i] = blocks[i]->meshblock_data.Get(stage_name_); + } +} + +template +void MeshData::Set(BlockList_t blocks, Mesh *pmesh) { + int ndim; + if (pmesh != nullptr) { + ndim = pmesh->ndim; + } + Set(blocks, pmesh, ndim); +} + template class MeshData; } // namespace parthenon diff --git a/src/interface/mesh_data.hpp b/src/interface/mesh_data.hpp index c1da0f13e614..07726e864f56 100644 --- a/src/interface/mesh_data.hpp +++ b/src/interface/mesh_data.hpp @@ -246,15 +246,8 @@ class MeshData { } } - void Set(BlockList_t blocks, Mesh *pmesh) { - const int nblocks = blocks.size(); - block_data_.resize(nblocks); - SetMeshPointer(pmesh); - for (int i = 0; i < nblocks; i++) { - block_data_[i] = blocks[i]->meshblock_data.Get(stage_name_); - } - } - + void Set(BlockList_t blocks, Mesh *pmesh, int ndim); + void Set(BlockList_t blocks, Mesh *pmesh); void Initialize(const MeshData *src, const std::vector &names, const bool shallow); @@ -419,6 +412,7 @@ class MeshData { bvars_cache_.clear(); } + int GetNDim() const { return ndim_; } int NumBlocks() const { return block_data_.size(); } bool operator==(MeshData &cmp) const { @@ -442,6 +436,7 @@ class MeshData { SparsePackCache &GetSparsePackCache() { return sparse_pack_cache_; } private: + int ndim_; Mesh *pmy_mesh_; BlockDataList_t block_data_; std::string stage_name_; diff --git a/src/interface/meshblock_data.cpp b/src/interface/meshblock_data.cpp index 1b194597911b..5f685c0e119d 100644 --- a/src/interface/meshblock_data.cpp +++ b/src/interface/meshblock_data.cpp @@ -327,7 +327,7 @@ template typename MeshBlockData::VarList MeshBlockData::GetVariablesByFlag(const Metadata::FlagCollection &flags, const std::vector &sparse_ids) { - Kokkos::Profiling::pushRegion("GetVariablesByFlag"); + PARTHENON_INSTRUMENT typename MeshBlockData::VarList var_list; std::unordered_set sparse_ids_set(sparse_ids.begin(), sparse_ids.end()); @@ -338,7 +338,6 @@ 
MeshBlockData::GetVariablesByFlag(const Metadata::FlagCollection &flags, var_list.Add(v, sparse_ids_set); } - Kokkos::Profiling::popRegion(); // GetVariablesByFlag return var_list; } diff --git a/src/interface/metadata.cpp b/src/interface/metadata.cpp index d91147af2491..0becfde5cd94 100644 --- a/src/interface/metadata.cpp +++ b/src/interface/metadata.cpp @@ -103,7 +103,8 @@ MetadataFlag Metadata::GetUserFlag(const std::string &flagname) { namespace parthenon { Metadata::Metadata(const std::vector &bits, const std::vector &shape, const std::vector &component_labels, - const std::string &associated) + const std::string &associated, + const refinement::RefinementFunctions_t ref_funcs_) : shape_(shape), component_labels_(component_labels), associated_(associated) { // set flags for (const auto f : bits) { @@ -126,8 +127,7 @@ Metadata::Metadata(const std::vector &bits, const std::vector // If variable is refined, set a default prolongation/restriction op // TODO(JMM): This is dangerous. See Issue #844. if (IsRefined()) { - refinement_funcs_ = refinement::RefinementFunctions_t::RegisterOps< - refinement_ops::ProlongateSharedMinMod, refinement_ops::RestrictAverage>(); + refinement_funcs_ = ref_funcs_; } // check if all flag constraints are satisfied, throw if not diff --git a/src/interface/metadata.hpp b/src/interface/metadata.hpp index 7a8658164ee3..707f8a6bd917 100644 --- a/src/interface/metadata.hpp +++ b/src/interface/metadata.hpp @@ -316,9 +316,13 @@ class Metadata { // 4 constructors, this is the general constructor called by all other constructors, so // we do some sanity checks here - Metadata(const std::vector &bits, const std::vector &shape = {}, - const std::vector &component_labels = {}, - const std::string &associated = ""); + Metadata( + const std::vector &bits, const std::vector &shape = {}, + const std::vector &component_labels = {}, + const std::string &associated = "", + const refinement::RefinementFunctions_t ref_funcs_ = + refinement::RefinementFunctions_t::RegisterOps< + refinement_ops::ProlongateSharedMinMod, refinement_ops::RestrictAverage>()); // 1 constructor Metadata(const std::vector &bits, const std::vector &shape, @@ -548,7 +552,6 @@ class Metadata { // Refinement stuff const refinement::RefinementFunctions_t &GetRefinementFunctions() const { - PARTHENON_REQUIRE_THROWS(IsRefined(), "Variable must be registered for refinement"); return refinement_funcs_; } template , ParArray1D, ParArray2D, ParArray3D, ParArray4D, \ - ParArray5D, ParArray6D, ParArray7D, ParArray8D, HostArray1D, \ - HostArray2D, HostArray3D, HostArray4D, HostArray5D, HostArray6D, \ - HostArray7D, Kokkos::View, Kokkos::View, ParArrayND, \ - ParArrayHost - #ifdef ENABLE_HDF5 template @@ -63,7 +50,7 @@ void Params::WriteToHDF5AllParamsOfMultipleTypes(const std::string &prefix, template void Params::WriteToHDF5AllParamsOfTypeOrVec(const std::string &prefix, const HDF5::H5G &group) const { - WriteToHDF5AllParamsOfMultipleTypes(prefix, group); + WriteToHDF5AllParamsOfMultipleTypes(prefix, group); } template @@ -91,7 +78,7 @@ void Params::ReadFromHDF5AllParamsOfMultipleTypes(const std::string &prefix, template void Params::ReadFromHDF5AllParamsOfTypeOrVec(const std::string &prefix, const HDF5::H5G &group) { - ReadFromHDF5AllParamsOfMultipleTypes(prefix, group); + ReadFromHDF5AllParamsOfMultipleTypes(prefix, group); } void Params::WriteAllToHDF5(const std::string &prefix, const HDF5::H5G &group) const { diff --git a/src/interface/params.hpp b/src/interface/params.hpp index ce27dce50e2d..4b2f4a09d0cd 100644 --- 
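[The Metadata constructor now threads caller-supplied prolongation/restriction operators through, with the previously hard-coded pair as the default, and SparsePool forwards the shared metadata's functions so pool entries inherit custom ops. A sketch of registering a pair at construction; the flags and shape here are illustrative, and the two op templates named are the defaults from this patch standing in for user-defined ones:

    using namespace parthenon;
    auto ref_funcs = refinement::RefinementFunctions_t::RegisterOps<
        refinement_ops::ProlongateSharedMinMod, refinement_ops::RestrictAverage>();
    Metadata m({Metadata::Cell, Metadata::FillGhost}, /*shape=*/{1},
               /*component_labels=*/{}, /*associated=*/"", ref_funcs);
]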
a/src/interface/params.hpp +++ b/src/interface/params.hpp @@ -26,7 +26,7 @@ #include "utils/error_checking.hpp" #ifdef ENABLE_HDF5 -#include "outputs/parthenon_hdf5.hpp" +#include "outputs/parthenon_hdf5_types.hpp" #endif namespace parthenon { diff --git a/src/interface/sparse_pack.hpp b/src/interface/sparse_pack.hpp index 9a98d8ba980a..d5fdf37a2a30 100644 --- a/src/interface/sparse_pack.hpp +++ b/src/interface/sparse_pack.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -90,6 +90,7 @@ KOKKOS_INLINE_FUNCTION PackIdx operator+(T offset, PackIdx idx) { // device namespace variable_names { // Struct that all variable_name types should inherit from +constexpr int ANYDIM = -1234; // ANYDIM must be a slowest-moving index template struct base_t { KOKKOS_INLINE_FUNCTION @@ -98,6 +99,21 @@ struct base_t { KOKKOS_INLINE_FUNCTION explicit base_t(int idx1) : idx(idx1) {} + /* + for 2D:, (M, N), + idx(m, n) = N*m + n + for 3D: (L, M, N) + idx(l, m, n) = (M*l + m)*N + n + = l*M*N + m*N + n + */ + template ::value), + REQUIRES(sizeof...(Args) == sizeof...(NCOMP))> + KOKKOS_INLINE_FUNCTION explicit base_t(Args... args) + : idx(GetIndex_(std::forward(args)...)) { + static_assert(CheckArgs_(NCOMP...), + "All dimensions must be strictly positive, " + "except the first (slowest), which may be ANYDIM."); + } virtual ~base_t() = default; // All of these are just static methods so that there is no @@ -106,6 +122,11 @@ struct base_t { PARTHENON_FAIL("Need to implement your own name method."); return "error"; } + template + static constexpr auto GetDim() { + return std::get(std::make_tuple(NCOMP...)); + } + static std::vector GetShape() { return std::vector{NCOMP...}; } KOKKOS_INLINE_FUNCTION static bool regex() { return REGEX; } KOKKOS_INLINE_FUNCTION @@ -114,6 +135,23 @@ struct base_t { static int size() { return multiply::value; } const int idx; + + private: + template ::value)> + static constexpr bool CheckArgs_(int head, Tail... tail) { + return (... && (tail > 0)); + } + template + KOKKOS_INLINE_FUNCTION static auto GetIndex_(Args... args) { + int idx = 0; + ( + [&] { + idx *= NCOMP; + idx += args; + }(), + ...); + return idx; + } }; // An example variable name type that selects all variables available @@ -267,6 +305,45 @@ class SparsePack : public SparsePackBase { return bounds_h_(1, b, vidx); } + /* Usage: + * Contains(b, v1(), v2(), v3()) + * + * returns true if all listed vars are present on block b, false + * otherwise. + */ + KOKKOS_INLINE_FUNCTION bool Contains(const int b) const { + return GetUpperBound(b) >= 0; + } + template + KOKKOS_INLINE_FUNCTION bool Contains(const int b, const T t) const { + return GetUpperBound(b, t) >= 0; + } + template + KOKKOS_INLINE_FUNCTION bool Contains(const int b, Args... args) const { + return (... && Contains(b, args)); + } + // Version that takes templates but no arguments passed + template 0)> + KOKKOS_INLINE_FUNCTION bool Contains(const int b) const { + return (... 
&& Contains(b, Args())); + } + // Host versions + KOKKOS_INLINE_FUNCTION bool ContainsHost(const int b) const { + return GetUpperBoundHost(b) >= 0; + } + template + KOKKOS_INLINE_FUNCTION bool ContainsHost(const int b, const T t) const { + return GetUpperBoundHost(b, t) >= 0; + } + template + KOKKOS_INLINE_FUNCTION bool ContainsHost(const int b, Args... args) const { + return (... && ContainsHost(b, args)); + } + template 0)> + KOKKOS_INLINE_FUNCTION bool ContainsHost(const int b) const { + return (... && ContainsHost(b, Args())); + } + // operator() overloads using TE = TopologicalElement; KOKKOS_INLINE_FUNCTION auto &operator()(const int b, const TE el, const int idx) const { diff --git a/src/interface/sparse_pool.cpp b/src/interface/sparse_pool.cpp index 318a8fdb5cfc..8a6e0ef213ca 100644 --- a/src/interface/sparse_pool.cpp +++ b/src/interface/sparse_pool.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -59,7 +59,7 @@ const Metadata &SparsePool::AddImpl(int sparse_id, const std::vector &shape shared_metadata_.Flags(), shape.size() > 0 ? shape : shared_metadata_.Shape(), component_labels.size() > 0 ? component_labels : shared_metadata_.getComponentLabels(), - shared_metadata_.getAssociated()); + shared_metadata_.getAssociated(), shared_metadata_.GetRefinementFunctions()); this_metadata.SetSparseThresholds(shared_metadata_.GetAllocationThreshold(), shared_metadata_.GetDeallocationThreshold(), diff --git a/src/interface/swarm.cpp b/src/interface/swarm.cpp index c9db0ced5478..f9eed4fd9647 100644 --- a/src/interface/swarm.cpp +++ b/src/interface/swarm.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -28,11 +28,11 @@ SwarmDeviceContext Swarm::GetDeviceContext() const { SwarmDeviceContext context; context.marked_for_removal_ = marked_for_removal_; context.mask_ = mask_; - context.blockIndex_ = blockIndex_; - context.neighborIndices_ = neighborIndices_; - context.cellSorted_ = cellSorted_; - context.cellSortedBegin_ = cellSortedBegin_; - context.cellSortedNumber_ = cellSortedNumber_; + context.block_index_ = block_index_; + context.neighbor_indices_ = neighbor_indices_; + context.cell_sorted_ = cell_sorted_; + context.cell_sorted_begin_ = cell_sorted_begin_; + context.cell_sorted_number_ = cell_sorted_number_; auto pmb = GetBlockPointer(); auto pmesh = pmb->pmy_mesh; @@ -64,9 +64,14 @@ SwarmDeviceContext Swarm::GetDeviceContext() const { Swarm::Swarm(const std::string &label, const Metadata &metadata, const int nmax_pool_in) : label_(label), m_(metadata), nmax_pool_(nmax_pool_in), mask_("mask", nmax_pool_), - marked_for_removal_("mfr", nmax_pool_), blockIndex_("blockIndex_", nmax_pool_), - neighborIndices_("neighborIndices_", 4, 4, 4), - cellSorted_("cellSorted_", nmax_pool_), mpiStatus(true) { + marked_for_removal_("mfr", nmax_pool_), block_index_("block_index_", nmax_pool_), + neighbor_indices_("neighbor_indices_", 4, 4, 4), + new_indices_("new_indices_", nmax_pool_), + from_to_indices_("from_to_indices_", nmax_pool_ + 1), + recv_neighbor_index_("recv_neighbor_index_", nmax_pool_), + recv_buffer_index_("recv_buffer_index_", nmax_pool_), + num_particles_to_send_("num_particles_to_send_", NMAX_NEIGHBORS), + cell_sorted_("cell_sorted_", nmax_pool_), mpiStatus(true) { PARTHENON_REQUIRE_THROWS(typeid(Coordinates_t) == typeid(UniformCartesian), "SwarmDeviceContext only supports a uniform Cartesian mesh!"); @@ -78,6 +83,7 @@ Swarm::Swarm(const std::string &label, const Metadata &metadata, const int nmax_ num_active_ = 0; max_active_index_ = 0; + // TODO(BRR) Do this in a device kernel? auto mask_h = Kokkos::create_mirror_view(HostMemSpace(), mask_); auto marked_for_removal_h = Kokkos::create_mirror_view(HostMemSpace(), marked_for_removal_); @@ -92,62 +98,9 @@ Swarm::Swarm(const std::string &label, const Metadata &metadata, const int nmax_ Kokkos::deep_copy(marked_for_removal_, marked_for_removal_h); } -template -void Swarm::AllocateBoundariesImpl_(MeshBlock *pmb) { - std::stringstream msg; - auto &bcs = pmb->pmy_mesh->mesh_bcs; - if (bcs[iFace] == BoundaryFlag::outflow) { - bounds_uptrs[iFace] = DeviceAllocate(); - } else if (bcs[iFace] == BoundaryFlag::periodic) { - bounds_uptrs[iFace] = DeviceAllocate(); - } else if (bcs[iFace] == BoundaryFlag::user) { - if (pmb->pmy_mesh->SwarmBndryFnctn[iFace] != nullptr) { - bounds_uptrs[iFace] = pmb->pmy_mesh->SwarmBndryFnctn[iFace](); - } else { - msg << (iFace % 2 == 0 ? "i" : "o") << "x" << iFace / 2 + 1 - << " user boundary requested but provided function is null!"; - PARTHENON_THROW(msg); - } - } else { - msg << (iFace % 2 == 0 ? 
"i" : "o") << "x" << iFace / 2 + 1 << " boundary flag " - << static_cast(bcs[iFace]) << " not supported!"; - PARTHENON_THROW(msg); - } -} - -void Swarm::AllocateBoundaries() { - auto pmb = GetBlockPointer(); - std::stringstream msg; - - auto &bcs = pmb->pmy_mesh->mesh_bcs; - - AllocateBoundariesImpl_( - pmb.get()); - AllocateBoundariesImpl_( - pmb.get()); - AllocateBoundariesImpl_( - pmb.get()); - AllocateBoundariesImpl_( - pmb.get()); - AllocateBoundariesImpl_( - pmb.get()); - AllocateBoundariesImpl_( - pmb.get()); - - for (int n = 0; n < 6; n++) { - bounds_d.bounds[n] = bounds_uptrs[n].get(); - std::stringstream msg; - msg << "Boundary condition on face " << n << " missing.\n" - << "Please set it to `outflow`, `periodic`, or `user` in the input deck.\n" - << "If you set it to user, you must also manually set " - << "the swarm boundary pointer in your application." << std::endl; - PARTHENON_REQUIRE(bounds_d.bounds[n] != nullptr, msg); - } -} - -void Swarm::Add(const std::vector &labelArray, const Metadata &metadata) { +void Swarm::Add(const std::vector &label_array, const Metadata &metadata) { // generate the vector and call Add - for (auto label : labelArray) { + for (auto label : label_array) { Add(label, metadata); } } @@ -168,8 +121,8 @@ std::shared_ptr Swarm::AllocateCopy(MeshBlock * /*pmb*/) { void Swarm::Add(const std::string &label, const Metadata &metadata) { // labels must be unique, even between different types of data // if (intMap_.count(label) > 0 || realMap_.count(label) > 0) { - if (std::get()>(Maps_).count(label) > 0 || - std::get()>(Maps_).count(label) > 0) { + if (std::get()>(maps_).count(label) > 0 || + std::get()>(maps_).count(label) > 0) { throw std::invalid_argument("swarm variable " + label + " already enrolled during Add()!"); } @@ -194,14 +147,14 @@ void Swarm::Add(const std::string &label, const Metadata &metadata) { void Swarm::Remove(const std::string &label) { bool found = false; - auto &intMap_ = std::get()>(Maps_); - auto &intVector_ = std::get()>(Vectors_); - auto &realMap_ = std::get()>(Maps_); - auto &realVector_ = std::get()>(Vectors_); + auto &int_map = std::get()>(maps_); + auto &int_vector = std::get()>(vectors_); + auto &real_map = std::get()>(maps_); + auto &real_vector = std::get()>(vectors_); // Find index of variable int idx = 0; - for (auto v : intVector_) { + for (auto v : int_vector) { if (label == v->label()) { found = true; break; @@ -210,19 +163,19 @@ void Swarm::Remove(const std::string &label) { } if (found == true) { // first delete the variable - intVector_[idx].reset(); + int_vector[idx].reset(); // Next move the last element into idx and pop last entry - if (intVector_.size() > 1) intVector_[idx] = std::move(intVector_.back()); - intVector_.pop_back(); + if (int_vector.size() > 1) int_vector[idx] = std::move(int_vector.back()); + int_vector.pop_back(); // Also remove variable from map - intMap_.erase(label); + int_map.erase(label); } if (found == false) { idx = 0; - for (const auto &v : realVector_) { + for (const auto &v : real_vector) { if (label == v->label()) { found = true; break; @@ -231,10 +184,10 @@ void Swarm::Remove(const std::string &label) { } } if (found == true) { - realVector_[idx].reset(); - if (realVector_.size() > 1) realVector_[idx] = std::move(realVector_.back()); - realVector_.pop_back(); - realMap_.erase(label); + real_vector[idx].reset(); + if (real_vector.size() > 1) real_vector[idx] = std::move(real_vector.back()); + real_vector.pop_back(); + real_map.erase(label); } if (found == false) { @@ -256,24 +209,28 
@@ void Swarm::setPoolMax(const std::int64_t nmax_pool) { // Rely on Kokkos setting the newly added values to false for these arrays Kokkos::resize(mask_, nmax_pool); Kokkos::resize(marked_for_removal_, nmax_pool); + Kokkos::resize(new_indices_, nmax_pool); + Kokkos::resize(from_to_indices_, nmax_pool + 1); + Kokkos::resize(recv_neighbor_index_, nmax_pool); + Kokkos::resize(recv_buffer_index_, nmax_pool); pmb->LogMemUsage(2 * n_new * sizeof(bool)); - Kokkos::resize(cellSorted_, nmax_pool); + Kokkos::resize(cell_sorted_, nmax_pool); pmb->LogMemUsage(n_new * sizeof(SwarmKey)); - blockIndex_.Resize(nmax_pool); + block_index_.Resize(nmax_pool); pmb->LogMemUsage(n_new * sizeof(int)); - auto &intVector_ = std::get()>(Vectors_); - auto &realVector_ = std::get()>(Vectors_); + auto &int_vector = std::get()>(vectors_); + auto &real_vector = std::get()>(vectors_); - for (auto &d : intVector_) { + for (auto &d : int_vector) { d->data.Resize(d->data.GetDim(6), d->data.GetDim(5), d->data.GetDim(4), d->data.GetDim(3), d->data.GetDim(2), nmax_pool); pmb->LogMemUsage(n_new * sizeof(int)); } - for (auto &d : realVector_) { + for (auto &d : real_vector) { d->data.Resize(d->data.GetDim(6), d->data.GetDim(5), d->data.GetDim(4), d->data.GetDim(3), d->data.GetDim(2), nmax_pool); pmb->LogMemUsage(n_new * sizeof(Real)); @@ -282,58 +239,52 @@ void Swarm::setPoolMax(const std::int64_t nmax_pool) { nmax_pool_ = nmax_pool; } -ParArray1D Swarm::AddEmptyParticles(const int num_to_add, - ParArrayND &new_indices) { - if (num_to_add <= 0) { - new_indices = ParArrayND(); - return ParArray1D(); - } +NewParticlesContext Swarm::AddEmptyParticles(const int num_to_add) { + PARTHENON_DEBUG_REQUIRE(num_to_add >= 0, "Cannot add negative numbers of particles!"); - while (free_indices_.size() < num_to_add) { - increasePoolMax(); - } - - ParArray1D new_mask("Newly created particles", nmax_pool_); - auto new_mask_h = Kokkos::create_mirror_view(HostMemSpace(), new_mask); - for (int n = 0; n < nmax_pool_; n++) { - new_mask_h(n) = false; - } + if (num_to_add > 0) { + while (free_indices_.size() < num_to_add) { + increasePoolMax(); + } - auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); + // TODO(BRR) Use par_scan on device rather than do this on host + auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); - auto blockIndex_h = blockIndex_.GetHostMirrorAndCopy(); + auto block_index_h = block_index_.GetHostMirrorAndCopy(); - auto free_index = free_indices_.begin(); + auto free_index = free_indices_.begin(); - new_indices = ParArrayND("New indices", num_to_add); - auto new_indices_h = new_indices.GetHostMirror(); + auto new_indices_h = new_indices_.GetHostMirror(); - // Don't bother sanitizing the memory - for (int n = 0; n < num_to_add; n++) { - mask_h(*free_index) = true; - new_mask_h(*free_index) = true; - blockIndex_h(*free_index) = this_block_; - max_active_index_ = std::max(max_active_index_, *free_index); - new_indices_h(n) = *free_index; + // Don't bother sanitizing the memory + for (int n = 0; n < num_to_add; n++) { + mask_h(*free_index) = true; + block_index_h(*free_index) = this_block_; + max_active_index_ = std::max(max_active_index_, *free_index); + new_indices_h(n) = *free_index; - free_index = free_indices_.erase(free_index); - } + free_index = free_indices_.erase(free_index); + } - new_indices.DeepCopy(new_indices_h); + new_indices_.DeepCopy(new_indices_h); - num_active_ += num_to_add; + num_active_ += num_to_add; - Kokkos::deep_copy(new_mask, new_mask_h); - Kokkos::deep_copy(mask_, 
mask_h); - blockIndex_.DeepCopy(blockIndex_h); + Kokkos::deep_copy(mask_, mask_h); + block_index_.DeepCopy(block_index_h); + new_indices_max_idx_ = num_to_add - 1; + } else { + new_indices_max_idx_ = -1; + } - return new_mask; + return NewParticlesContext(new_indices_max_idx_, new_indices_); } // No active particles: nmax_active_index = -1 // No particles removed: nmax_active_index unchanged // Particles removed: nmax_active_index is new max active index void Swarm::RemoveMarkedParticles() { + // TODO(BRR) Use par_scan to do this on device rather than host auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); auto marked_for_removal_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), marked_for_removal_); @@ -366,8 +317,7 @@ void Swarm::Defrag() { std::int64_t num_free = (max_active_index_ + 1) - num_active_; auto pmb = GetBlockPointer(); - ParArrayND from_to_indices("from_to_indices", max_active_index_ + 1); - auto from_to_indices_h = from_to_indices.GetHostMirror(); + auto from_to_indices_h = from_to_indices_.GetHostMirror(); auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); @@ -402,32 +352,34 @@ void Swarm::Defrag() { new_free_indices.sort(); free_indices_.merge(new_free_indices); - from_to_indices.DeepCopy(from_to_indices_h); + from_to_indices_.DeepCopy(from_to_indices_h); + + auto from_to_indices = from_to_indices_; auto &mask = mask_; pmb->par_for( - "Swarm::DefragMask", 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { if (from_to_indices(n) >= 0) { mask(from_to_indices(n)) = mask(n); mask(n) = false; } }); - auto &intVector_ = std::get()>(Vectors_); - auto &realVector_ = std::get()>(Vectors_); + auto &int_vector = std::get()>(vectors_); + auto &real_vector = std::get()>(vectors_); PackIndexMap real_imap; PackIndexMap int_imap; auto vreal = PackAllVariables_(real_imap); auto vint = PackAllVariables_(int_imap); - int real_vars_size = realVector_.size(); - int int_vars_size = intVector_.size(); + int real_vars_size = real_vector.size(); + int int_vars_size = int_vector.size(); auto real_map = real_imap.Map(); auto int_map = int_imap.Map(); const int realPackDim = vreal.GetDim(2); const int intPackDim = vint.GetDim(2); pmb->par_for( - "Swarm::DefragVariables", 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { if (from_to_indices(n) >= 0) { for (int vidx = 0; vidx < realPackDim; vidx++) { vreal(vidx, from_to_indices(n)) = vreal(vidx, n); @@ -444,9 +396,9 @@ void Swarm::Defrag() { /// /// Routine to sort particles by cell. 
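/// As a worked example of the sort key (values illustrative): with nx1 = 6
/// and nx2 = 6, a particle in cell (i = 2, j = 1, k = 0) maps to
/// cell_idx_1d = i + nx1 * (j + nx2 * k) = 2 + 6 * (1 + 6 * 0) = 8, so the
/// key increases fastest in i, then j, then k.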
Updates internal swarm variables: -/// cellSorted_: 1D Per-cell sorted array of swarm memory indices -/// (SwarmKey::swarm_index_) cellSortedBegin_: Per-cell array of starting indices in -/// cellSorted_ cellSortedNumber_: Per-cell array of number of particles in each cell +/// cell_sorted_: 1D Per-cell sorted array of swarm memory indices +/// (SwarmKey::swarm_index_) cell_sorted_begin_: Per-cell array of starting indices in +/// cell_sorted_ cell_sorted_number_: Per-cell array of number of particles in each cell /// void Swarm::SortParticlesByCell() { auto pmb = GetBlockPointer(); @@ -461,37 +413,37 @@ void Swarm::SortParticlesByCell() { PARTHENON_REQUIRE(nx1 * nx2 * nx3 < std::numeric_limits::max(), "Too many cells for an int32 to store cell_idx_1d below!"); - auto cellSorted = cellSorted_; + auto cell_sorted = cell_sorted_; int ncells = pmb->cellbounds.GetTotal(IndexDomain::entire); int num_active = num_active_; int max_active_index = max_active_index_; // Allocate data if necessary - if (cellSortedBegin_.GetDim(1) == 0) { - cellSortedBegin_ = ParArrayND("cellSortedBegin_", nx3, nx2, nx1); - cellSortedNumber_ = ParArrayND("cellSortedNumber_", nx3, nx2, nx1); + if (cell_sorted_begin_.GetDim(1) == 0) { + cell_sorted_begin_ = ParArrayND("cell_sorted_begin_", nx3, nx2, nx1); + cell_sorted_number_ = ParArrayND("cell_sorted_number_", nx3, nx2, nx1); } - auto cellSortedBegin = cellSortedBegin_; - auto cellSortedNumber = cellSortedNumber_; + auto cell_sorted_begin = cell_sorted_begin_; + auto cell_sorted_number = cell_sorted_number_; auto swarm_d = GetDeviceContext(); // Write an unsorted list pmb->par_for( - "Write unsorted list", 0, max_active_index_, KOKKOS_LAMBDA(const int n) { + PARTHENON_AUTO_LABEL, 0, max_active_index_, KOKKOS_LAMBDA(const int n) { int i, j, k; swarm_d.Xtoijk(x(n), y(n), z(n), i, j, k); const int64_t cell_idx_1d = i + nx1 * (j + nx2 * k); - cellSorted(n) = SwarmKey(static_cast(cell_idx_1d), n); + cell_sorted(n) = SwarmKey(static_cast(cell_idx_1d), n); }); - sort(cellSorted, SwarmKeyComparator(), 0, max_active_index); + sort(cell_sorted, SwarmKeyComparator(), 0, max_active_index); // Update per-cell arrays for easier accessing later const IndexRange &ib = pmb->cellbounds.GetBoundsI(IndexDomain::entire); const IndexRange &jb = pmb->cellbounds.GetBoundsJ(IndexDomain::entire); const IndexRange &kb = pmb->cellbounds.GetBoundsK(IndexDomain::entire); pmb->par_for( - "Update per-cell arrays", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int k, const int j, const int i) { int cell_idx_1d = i + nx1 * (j + nx2 * k); // Find starting index, first by guessing @@ -506,711 +458,55 @@ void Swarm::SortParticlesByCell() { break; } - if (cellSorted(start_index).cell_idx_1d_ == cell_idx_1d) { + if (cell_sorted(start_index).cell_idx_1d_ == cell_idx_1d) { if (start_index == 0) { break; - } else if (cellSorted(start_index - 1).cell_idx_1d_ != cell_idx_1d) { + } else if (cell_sorted(start_index - 1).cell_idx_1d_ != cell_idx_1d) { break; } else { start_index--; continue; } } - if (cellSorted(start_index).cell_idx_1d_ >= cell_idx_1d) { + if (cell_sorted(start_index).cell_idx_1d_ >= cell_idx_1d) { start_index--; if (start_index < 0) { start_index = -1; break; } - if (cellSorted(start_index).cell_idx_1d_ < cell_idx_1d) { + if (cell_sorted(start_index).cell_idx_1d_ < cell_idx_1d) { start_index = -1; break; } continue; } - if (cellSorted(start_index).cell_idx_1d_ < cell_idx_1d) { + if (cell_sorted(start_index).cell_idx_1d_ < 
cell_idx_1d) { start_index++; if (start_index > max_active_index) { start_index = -1; break; } - if (cellSorted(start_index).cell_idx_1d_ > cell_idx_1d) { + if (cell_sorted(start_index).cell_idx_1d_ > cell_idx_1d) { start_index = -1; break; } continue; } } - cellSortedBegin(k, j, i) = start_index; + cell_sorted_begin(k, j, i) = start_index; if (start_index == -1) { - cellSortedNumber(k, j, i) = 0; + cell_sorted_number(k, j, i) = 0; } else { int number = 0; int current_index = start_index; while (current_index <= max_active_index && - cellSorted(current_index).cell_idx_1d_ == cell_idx_1d) { + cell_sorted(current_index).cell_idx_1d_ == cell_idx_1d) { current_index++; number++; - cellSortedNumber(k, j, i) = number; - } - } - }); -} - -/// -/// Routine for precomputing neighbor indices to efficiently compute particle -/// position in terms of neighbor blocks based on spatial position. See -/// GetNeighborBlockIndex() -/// -void Swarm::SetNeighborIndices1D_() { - auto pmb = GetBlockPointer(); - const int ndim = pmb->pmy_mesh->ndim; - auto neighborIndices_h = neighborIndices_.GetHostMirror(); - - // Initialize array in event of zero neighbors - for (int k = 0; k < 4; k++) { - for (int j = 0; j < 4; j++) { - for (int i = 0; i < 4; i++) { - neighborIndices_h(k, j, i) = no_block_; - } - } - } - - // Indicate which neighbor regions correspond to this meshblock - const int kmin = 0; - const int kmax = 4; - const int jmin = 0; - const int jmax = 4; - const int imin = 1; - const int imax = 3; - for (int k = kmin; k < kmax; k++) { - for (int j = jmin; j < jmax; j++) { - for (int i = imin; i < imax; i++) { - neighborIndices_h(k, j, i) = this_block_; - } - } - } - - auto mesh_bcs = pmb->pmy_mesh->mesh_bcs; - // Indicate which neighbor regions correspond to each neighbor meshblock - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - NeighborBlock &nb = pmb->pbval->neighbor[n]; - - const int i = nb.ni.ox1; - - if (i == -1) { - neighborIndices_h(0, 0, 0) = n; - } else if (i == 0) { - neighborIndices_h(0, 0, 1) = n; - neighborIndices_h(0, 0, 2) = n; - } else { - neighborIndices_h(0, 0, 3) = n; - } - } - - neighborIndices_.DeepCopy(neighborIndices_h); -} - -void Swarm::SetNeighborIndices2D_() { - auto pmb = GetBlockPointer(); - const int ndim = pmb->pmy_mesh->ndim; - auto neighborIndices_h = neighborIndices_.GetHostMirror(); - - // Initialize array in event of zero neighbors - for (int k = 0; k < 4; k++) { - for (int j = 0; j < 4; j++) { - for (int i = 0; i < 4; i++) { - neighborIndices_h(k, j, i) = no_block_; - } - } - } - - // Indicate which neighbor regions correspond to this meshblock - const int kmin = 0; - const int kmax = 4; - const int jmin = 1; - const int jmax = 3; - const int imin = 1; - const int imax = 3; - for (int k = kmin; k < kmax; k++) { - for (int j = jmin; j < jmax; j++) { - for (int i = imin; i < imax; i++) { - neighborIndices_h(k, j, i) = this_block_; - } - } - } - - // Indicate which neighbor regions correspond to each neighbor meshblock - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - NeighborBlock &nb = pmb->pbval->neighbor[n]; - - const int i = nb.ni.ox1; - const int j = nb.ni.ox2; - - if (i == -1) { - if (j == -1) { - neighborIndices_h(0, 0, 0) = n; - } else if (j == 0) { - neighborIndices_h(0, 1, 0) = n; - neighborIndices_h(0, 2, 0) = n; - } else if (j == 1) { - neighborIndices_h(0, 3, 0) = n; - } - } else if (i == 0) { - if (j == -1) { - neighborIndices_h(0, 0, 1) = n; - neighborIndices_h(0, 0, 2) = n; - } else if (j == 1) { - neighborIndices_h(0, 3, 1) = n; - 
neighborIndices_h(0, 3, 2) = n; - } - } else if (i == 1) { - if (j == -1) { - neighborIndices_h(0, 0, 3) = n; - } else if (j == 0) { - neighborIndices_h(0, 1, 3) = n; - neighborIndices_h(0, 2, 3) = n; - } else if (j == 1) { - neighborIndices_h(0, 3, 3) = n; - } - } - } - - neighborIndices_.DeepCopy(neighborIndices_h); -} - -void Swarm::SetNeighborIndices3D_() { - auto pmb = GetBlockPointer(); - const int ndim = pmb->pmy_mesh->ndim; - auto neighborIndices_h = neighborIndices_.GetHostMirror(); - - // Initialize array in event of zero neighbors - for (int k = 0; k < 4; k++) { - for (int j = 0; j < 4; j++) { - for (int i = 0; i < 4; i++) { - neighborIndices_h(k, j, i) = no_block_; - } - } - } - - // Indicate which neighbor regions correspond to this meshblock - const int kmin = 1; - const int kmax = 3; - const int jmin = 1; - const int jmax = 3; - const int imin = 1; - const int imax = 3; - for (int k = kmin; k < kmax; k++) { - for (int j = jmin; j < jmax; j++) { - for (int i = imin; i < imax; i++) { - neighborIndices_h(k, j, i) = this_block_; - } - } - } - - // Indicate which neighbor regions correspond to each neighbor meshblock - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - NeighborBlock &nb = pmb->pbval->neighbor[n]; - - const int i = nb.ni.ox1; - const int j = nb.ni.ox2; - const int k = nb.ni.ox3; - - if (i == -1) { - if (j == -1) { - if (k == -1) { - neighborIndices_h(0, 0, 0) = n; - } else if (k == 0) { - neighborIndices_h(1, 0, 0) = n; - neighborIndices_h(2, 0, 0) = n; - } else if (k == 1) { - neighborIndices_h(3, 0, 0) = n; - } - } else if (j == 0) { - if (k == -1) { - neighborIndices_h(0, 1, 0) = n; - neighborIndices_h(0, 2, 0) = n; - } else if (k == 0) { - neighborIndices_h(1, 1, 0) = n; - neighborIndices_h(1, 2, 0) = n; - neighborIndices_h(2, 1, 0) = n; - neighborIndices_h(2, 2, 0) = n; - } else if (k == 1) { - neighborIndices_h(3, 1, 0) = n; - neighborIndices_h(3, 2, 0) = n; - } - } else if (j == 1) { - if (k == -1) { - neighborIndices_h(0, 3, 0) = n; - } else if (k == 0) { - neighborIndices_h(1, 3, 0) = n; - neighborIndices_h(2, 3, 0) = n; - } else if (k == 1) { - neighborIndices_h(3, 3, 0) = n; - } - } - } else if (i == 0) { - if (j == -1) { - if (k == -1) { - neighborIndices_h(0, 0, 1) = n; - neighborIndices_h(0, 0, 2) = n; - } else if (k == 0) { - neighborIndices_h(1, 0, 1) = n; - neighborIndices_h(1, 0, 2) = n; - neighborIndices_h(2, 0, 1) = n; - neighborIndices_h(2, 0, 2) = n; - } else if (k == 1) { - neighborIndices_h(3, 0, 1) = n; - neighborIndices_h(3, 0, 2) = n; - } - } else if (j == 0) { - if (k == -1) { - neighborIndices_h(0, 1, 1) = n; - neighborIndices_h(0, 1, 2) = n; - neighborIndices_h(0, 2, 1) = n; - neighborIndices_h(0, 2, 2) = n; - } else if (k == 1) { - neighborIndices_h(3, 1, 1) = n; - neighborIndices_h(3, 1, 2) = n; - neighborIndices_h(3, 2, 1) = n; - neighborIndices_h(3, 2, 2) = n; - } - } else if (j == 1) { - if (k == -1) { - neighborIndices_h(0, 3, 1) = n; - neighborIndices_h(0, 3, 2) = n; - } else if (k == 0) { - neighborIndices_h(1, 3, 1) = n; - neighborIndices_h(1, 3, 2) = n; - neighborIndices_h(2, 3, 1) = n; - neighborIndices_h(2, 3, 2) = n; - } else if (k == 1) { - neighborIndices_h(3, 3, 1) = n; - neighborIndices_h(3, 3, 2) = n; - } - } - } else if (i == 1) { - if (j == -1) { - if (k == -1) { - neighborIndices_h(0, 0, 3) = n; - } else if (k == 0) { - neighborIndices_h(1, 0, 3) = n; - neighborIndices_h(2, 0, 3) = n; - } else if (k == 1) { - neighborIndices_h(3, 0, 3) = n; - } - } else if (j == 0) { - if (k == -1) { - neighborIndices_h(0, 1, 3) 
= n; - neighborIndices_h(0, 2, 3) = n; - } else if (k == 0) { - neighborIndices_h(1, 1, 3) = n; - neighborIndices_h(1, 2, 3) = n; - neighborIndices_h(2, 1, 3) = n; - neighborIndices_h(2, 2, 3) = n; - } else if (k == 1) { - neighborIndices_h(3, 1, 3) = n; - neighborIndices_h(3, 2, 3) = n; - } - } else if (j == 1) { - if (k == -1) { - neighborIndices_h(0, 3, 3) = n; - } else if (k == 0) { - neighborIndices_h(1, 3, 3) = n; - neighborIndices_h(2, 3, 3) = n; - } else if (k == 1) { - neighborIndices_h(3, 3, 3) = n; - } - } - } - } - - neighborIndices_.DeepCopy(neighborIndices_h); -} - -void Swarm::SetupPersistentMPI() { - auto pmb = GetBlockPointer(); - vbswarm->SetupPersistentMPI(); - - const int ndim = pmb->pmy_mesh->ndim; - - const int nbmax = vbswarm->bd_var_.nbmax; - num_particles_to_send_ = ParArrayND("npts", nbmax); - - // Build up convenience array of neighbor indices - if (ndim == 1) { - SetNeighborIndices1D_(); - } else if (ndim == 2) { - SetNeighborIndices2D_(); - } else if (ndim == 3) { - SetNeighborIndices3D_(); - } else { - PARTHENON_FAIL("ndim must be 1, 2, or 3 for particles!"); - } - - neighbor_received_particles_.resize(nbmax); - - // Build device array mapping neighbor index to neighbor bufid - if (pmb->pbval->nneighbor > 0) { - ParArrayND neighbor_buffer_index("Neighbor buffer index", pmb->pbval->nneighbor); - auto neighbor_buffer_index_h = neighbor_buffer_index.GetHostMirror(); - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - neighbor_buffer_index_h(n) = pmb->pbval->neighbor[n].bufid; - } - neighbor_buffer_index.DeepCopy(neighbor_buffer_index_h); - neighbor_buffer_index_ = neighbor_buffer_index; - } -} - -int Swarm::CountParticlesToSend_() { - auto blockIndex_h = blockIndex_.GetHostMirrorAndCopy(); - auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); - auto swarm_d = GetDeviceContext(); - auto pmb = GetBlockPointer(); - const int nbmax = vbswarm->bd_var_.nbmax; - - // Fence to make sure particles aren't currently being transported locally - pmb->exec_space.fence(); - auto num_particles_to_send_h = num_particles_to_send_.GetHostMirror(); - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - num_particles_to_send_h(n) = 0; - } - const int particle_size = GetParticleDataSize(); - vbswarm->particle_size = particle_size; - - int max_indices_size = 0; - int total_noblock_particles = 0; - for (int n = 0; n <= max_active_index_; n++) { - if (mask_h(n)) { - // This particle should be sent - if (blockIndex_h(n) >= 0) { - num_particles_to_send_h(blockIndex_h(n))++; - if (max_indices_size < num_particles_to_send_h(blockIndex_h(n))) { - max_indices_size = num_particles_to_send_h(blockIndex_h(n)); - } - } - if (blockIndex_h(n) == no_block_) { - total_noblock_particles++; - } - } - } - // Size-0 arrays not permitted but we don't want to short-circuit subsequent logic that - // indicates completed communications - max_indices_size = std::max(1, max_indices_size); - - // Not a ragged-right array, just for convenience - if (total_noblock_particles > 0) { - auto noblock_indices = - ParArrayND("Particles with no block", total_noblock_particles); - auto noblock_indices_h = noblock_indices.GetHostMirror(); - int counter = 0; - for (int n = 0; n <= max_active_index_; n++) { - if (mask_h(n)) { - if (blockIndex_h(n) == no_block_) { - noblock_indices_h(counter) = n; - counter++; - } - } - } - noblock_indices.DeepCopy(noblock_indices_h); - ApplyBoundaries_(total_noblock_particles, noblock_indices); - } - - particle_indices_to_send_ = - ParArrayND("Particle indices to send", 
nbmax, max_indices_size); - auto particle_indices_to_send_h = particle_indices_to_send_.GetHostMirror(); - std::vector counter(nbmax, 0); - for (int n = 0; n <= max_active_index_; n++) { - if (mask_h(n)) { - if (blockIndex_h(n) >= 0) { - particle_indices_to_send_h(blockIndex_h(n), counter[blockIndex_h(n)]) = n; - counter[blockIndex_h(n)]++; - } - } - } - num_particles_to_send_.DeepCopy(num_particles_to_send_h); - particle_indices_to_send_.DeepCopy(particle_indices_to_send_h); - - num_particles_sent_ = 0; - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - // Resize buffer if too small - const int bufid = pmb->pbval->neighbor[n].bufid; - auto sendbuf = vbswarm->bd_var_.send[bufid]; - if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) { - sendbuf = BufArray1D("Buffer", num_particles_to_send_h(n) * particle_size); - vbswarm->bd_var_.send[bufid] = sendbuf; - } - vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size; - num_particles_sent_ += num_particles_to_send_h(n); - } - - return max_indices_size; -} - -void Swarm::LoadBuffers_(const int max_indices_size) { - auto swarm_d = GetDeviceContext(); - auto pmb = GetBlockPointer(); - const int particle_size = GetParticleDataSize(); - const int nneighbor = pmb->pbval->nneighbor; - - auto &intVector_ = std::get()>(Vectors_); - auto &realVector_ = std::get()>(Vectors_); - PackIndexMap real_imap; - PackIndexMap int_imap; - auto vreal = PackAllVariables_(real_imap); - auto vint = PackAllVariables_(int_imap); - const int realPackDim = vreal.GetDim(2); - const int intPackDim = vint.GetDim(2); - - // Pack index: - // [variable start] [swarm idx] - - auto &bdvar = vbswarm->bd_var_; - auto num_particles_to_send = num_particles_to_send_; - auto particle_indices_to_send = particle_indices_to_send_; - auto neighbor_buffer_index = neighbor_buffer_index_; - pmb->par_for( - "Pack Buffers", 0, max_indices_size - 1, - KOKKOS_LAMBDA(const int n) { // Max index - for (int m = 0; m < nneighbor; m++) { // Number of neighbors - const int bufid = neighbor_buffer_index(m); - if (n < num_particles_to_send(m)) { - const int sidx = particle_indices_to_send(m, n); - int buffer_index = n * particle_size; - swarm_d.MarkParticleForRemoval(sidx); - for (int i = 0; i < realPackDim; i++) { - bdvar.send[bufid](buffer_index) = vreal(i, sidx); - buffer_index++; - } - for (int i = 0; i < intPackDim; i++) { - bdvar.send[bufid](buffer_index) = static_cast(vint(i, sidx)); - buffer_index++; - } + cell_sorted_number(k, j, i) = number; } } }); - - RemoveMarkedParticles(); -} - -void Swarm::Send(BoundaryCommSubset phase) { - auto pmb = GetBlockPointer(); - const int nneighbor = pmb->pbval->nneighbor; - auto swarm_d = GetDeviceContext(); - - if (nneighbor == 0) { - // Process physical boundary conditions on "sent" particles - auto blockIndex_h = blockIndex_.GetHostMirrorAndCopy(); - auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); - - int total_sent_particles = 0; - pmb->par_reduce( - "total sent particles", 0, max_active_index_, - KOKKOS_LAMBDA(int n, int &total_sent_particles) { - if (swarm_d.IsActive(n)) { - if (!swarm_d.IsOnCurrentMeshBlock(n)) { - total_sent_particles++; - } - } - }, - Kokkos::Sum(total_sent_particles)); - - if (total_sent_particles > 0) { - ParArrayND new_indices("new indices", total_sent_particles); - auto new_indices_h = new_indices.GetHostMirrorAndCopy(); - int sent_particle_index = 0; - for (int n = 0; n <= max_active_index_; n++) { - if (mask_h(n)) { - if (blockIndex_h(n) >= 0 || blockIndex_h(n) == 
no_block_) { - new_indices_h(sent_particle_index) = n; - sent_particle_index++; - } - } - } - new_indices.DeepCopy(new_indices_h); - - ApplyBoundaries_(total_sent_particles, new_indices); - } - } else { - // Query particles for those to be sent - int max_indices_size = CountParticlesToSend_(); - - // Prepare buffers for send operations - LoadBuffers_(max_indices_size); - - // Send buffer data - vbswarm->Send(phase); - } -} - -void Swarm::CountReceivedParticles_() { - auto pmb = GetBlockPointer(); - total_received_particles_ = 0; - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - const int bufid = pmb->pbval->neighbor[n].bufid; - if (vbswarm->bd_var_.flag[bufid] == BoundaryStatus::arrived) { - PARTHENON_DEBUG_REQUIRE(vbswarm->recv_size[bufid] % vbswarm->particle_size == 0, - "Receive buffer is not divisible by particle size!"); - neighbor_received_particles_[n] = - vbswarm->recv_size[bufid] / vbswarm->particle_size; - total_received_particles_ += neighbor_received_particles_[n]; - } else { - neighbor_received_particles_[n] = 0; - } - } -} - -void Swarm::UpdateNeighborBufferReceiveIndices_(ParArrayND &neighbor_index, - ParArrayND &buffer_index) { - auto pmb = GetBlockPointer(); - auto neighbor_index_h = neighbor_index.GetHostMirror(); - auto buffer_index_h = - buffer_index.GetHostMirror(); // Index of each particle in its received buffer - - int id = 0; - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - for (int m = 0; m < neighbor_received_particles_[n]; m++) { - neighbor_index_h(id) = n; - buffer_index_h(id) = m; - id++; - } - } - neighbor_index.DeepCopy(neighbor_index_h); - buffer_index.DeepCopy(buffer_index_h); -} - -void Swarm::UnloadBuffers_() { - auto pmb = GetBlockPointer(); - - CountReceivedParticles_(); - - auto &bdvar = vbswarm->bd_var_; - - if (total_received_particles_ > 0) { - ParArrayND new_indices; - auto new_mask = AddEmptyParticles(total_received_particles_, new_indices); - - ParArrayND neighbor_index("Neighbor index", total_received_particles_); - ParArrayND buffer_index("Buffer index", total_received_particles_); - UpdateNeighborBufferReceiveIndices_(neighbor_index, buffer_index); - auto neighbor_buffer_index = neighbor_buffer_index_; - - auto &intVector_ = std::get()>(Vectors_); - auto &realVector_ = std::get()>(Vectors_); - PackIndexMap real_imap; - PackIndexMap int_imap; - auto vreal = PackAllVariables_(real_imap); - auto vint = PackAllVariables_(int_imap); - int realPackDim = vreal.GetDim(2); - int intPackDim = vint.GetDim(2); - - // construct map from buffer index to swarm index (or just return vector of indices!) 
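For reference, the pack and unpack kernels share one flat layout: particle m
occupies Reals [m * particle_size, (m + 1) * particle_size) of its buffer,
real components first, then integer components stored as casted Reals. A
minimal decoding sketch under that layout (DecodeParticle and its arguments
are hypothetical, not part of this patch):

    // Sketch only: read one particle's components back out of a flat buffer.
    // `recv` holds particle_size Reals per particle; reals precede ints.
    KOKKOS_INLINE_FUNCTION
    void DecodeParticle(const BufArray1D<Real> &recv, const int m,
                        const int particle_size, const int nreal,
                        const int nint, Real *reals, int *ints) {
      int bid = m * particle_size;
      for (int i = 0; i < nreal; i++) reals[i] = recv(bid++);
      for (int i = 0; i < nint; i++) ints[i] = static_cast<int>(recv(bid++));
    }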
- const int particle_size = GetParticleDataSize(); - auto swarm_d = GetDeviceContext(); - - pmb->par_for( - "Unload buffers", 0, total_received_particles_ - 1, KOKKOS_LAMBDA(const int n) { - const int sid = new_indices(n); - const int nid = neighbor_index(n); - int bid = buffer_index(n) * particle_size; - const int nbid = neighbor_buffer_index(nid); - for (int i = 0; i < realPackDim; i++) { - vreal(i, sid) = bdvar.recv[nbid](bid); - bid++; - } - for (int i = 0; i < intPackDim; i++) { - vint(i, sid) = static_cast(bdvar.recv[nbid](bid)); - bid++; - } - }); - - ApplyBoundaries_(total_received_particles_, new_indices); - } -} - -void Swarm::ApplyBoundaries_(const int nparticles, ParArrayND indices) { - auto pmb = GetBlockPointer(); - auto &x = Get("x").Get(); - auto &y = Get("y").Get(); - auto &z = Get("z").Get(); - auto swarm_d = GetDeviceContext(); - auto bcs = this->bounds_d; - - pmb->par_for( - "Swarm::ApplyBoundaries", 0, nparticles - 1, KOKKOS_LAMBDA(const int n) { - const int sid = indices(n); - for (int l = 0; l < 6; l++) { - bcs.bounds[l]->Apply(sid, x(sid), y(sid), z(sid), swarm_d); - } - }); - - RemoveMarkedParticles(); -} - -bool Swarm::Receive(BoundaryCommSubset phase) { - auto pmb = GetBlockPointer(); - const int nneighbor = pmb->pbval->nneighbor; - - if (nneighbor == 0) { - // Do nothing; no boundaries to receive - return true; - } else { - // Ensure all local deep copies marked BoundaryStatus::completed are actually received - pmb->exec_space.fence(); - - // Populate buffers - vbswarm->Receive(phase); - - // Transfer data from buffers to swarm memory pool - UnloadBuffers_(); - - auto &bdvar = vbswarm->bd_var_; - bool all_boundaries_received = true; - for (int n = 0; n < nneighbor; n++) { - NeighborBlock &nb = pmb->pbval->neighbor[n]; - if (bdvar.flag[nb.bufid] == BoundaryStatus::arrived) { - bdvar.flag[nb.bufid] = BoundaryStatus::completed; - } else if (bdvar.flag[nb.bufid] == BoundaryStatus::waiting) { - all_boundaries_received = false; - } - } - - return all_boundaries_received; - } -} - -void Swarm::ResetCommunication() { - auto pmb = GetBlockPointer(); -#ifdef MPI_PARALLEL - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - NeighborBlock &nb = pmb->pbval->neighbor[n]; - vbswarm->bd_var_.req_send[nb.bufid] = MPI_REQUEST_NULL; - } -#endif - - // Reset boundary statuses - for (int n = 0; n < pmb->pbval->nneighbor; n++) { - auto &nb = pmb->pbval->neighbor[n]; - vbswarm->bd_var_.flag[nb.bufid] = BoundaryStatus::waiting; - } -} - -bool Swarm::FinalizeCommunicationIterative() { - PARTHENON_THROW("FinalizeCommunicationIterative not yet implemented!"); - return true; -} - -void Swarm::AllocateComms(std::weak_ptr wpmb) { - if (wpmb.expired()) return; - - std::shared_ptr pmb = wpmb.lock(); - - // Create the boundary object - vbswarm = std::make_shared(pmb, label_); - - // Enroll SwarmVariable object - vbswarm->bswarm_index = pmb->pbswarm->bswarms.size(); - pmb->pbswarm->bswarms.push_back(vbswarm); } } // namespace parthenon diff --git a/src/interface/swarm.hpp b/src/interface/swarm.hpp index 041fbfb86d38..54864704bb10 100644 --- a/src/interface/swarm.hpp +++ b/src/interface/swarm.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -47,6 +47,31 @@ struct BoundaryDeviceContext { ParticleBound *bounds[6]; }; +// This class is returned by AddEmptyParticles. It provides accessors to the new particle +// memory by wrapping the persistent new_indices_ array. +class NewParticlesContext { + public: + NewParticlesContext(const int new_indices_max_idx, const ParArray1D new_indices) + : new_indices_max_idx_(new_indices_max_idx), new_indices_(new_indices) {} + + // Return the maximum index of the contiguous block of new particle indices. + KOKKOS_INLINE_FUNCTION + int GetNewParticlesMaxIndex() const { return new_indices_max_idx_; } + + // Given an index n into the contiguous block of new particle indices, return the swarm + // index of the new particle. + KOKKOS_INLINE_FUNCTION + int GetNewParticleIndex(const int n) const { + PARTHENON_DEBUG_REQUIRE(n >= 0 && n <= new_indices_max_idx_, + "New particle index is out of bounds!"); + return new_indices_(n); + } + + private: + const int new_indices_max_idx_; + ParArray1D new_indices_; +}; + class MeshBlock; enum class PARTICLE_STATUS { UNALLOCATED, ALIVE, DEAD }; @@ -112,18 +137,18 @@ class Swarm { /// Get particle variable template bool Contains(const std::string &label) { - return std::get()>(Maps_).count(label); + return std::get()>(maps_).count(label); } // TODO(JMM): Kind of sucks to have two Gets here. // Ben could we remove the get reference one and always get a // pointer? template ParticleVariable &Get(const std::string &label) { - return *std::get()>(Maps_).at(label); + return *std::get()>(maps_).at(label); } template std::shared_ptr> GetP(const std::string &label) const { - return std::get()>(Maps_).at(label); + return std::get()>(maps_).at(label); } /// Assign label for swarm @@ -170,7 +195,7 @@ class Swarm { void RemoveMarkedParticles(); /// Open up memory for new empty particles, return a mask to these particles - ParArray1D AddEmptyParticles(const int num_to_add, ParArrayND &new_indices); + NewParticlesContext AddEmptyParticles(const int num_to_add); /// Defragment the list by moving active particles so they are contiguous in /// memory @@ -189,10 +214,10 @@ class Swarm { // integers are cast as Reals. 
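A typical consumer of the context returned by AddEmptyParticles captures it
in a kernel and maps the contiguous range [0, GetNewParticlesMaxIndex()]
onto swarm slots. A minimal sketch, assuming a Swarm `swarm` with a Real
variable "x" and an enclosing meshblock `pmb` (both hypothetical here):

    // Sketch: initialize newly created particles in a device kernel.
    auto new_particles = swarm->AddEmptyParticles(num_to_add);
    auto &x = swarm->Get<Real>("x").Get();
    pmb->par_for(
        PARTHENON_AUTO_LABEL, 0, new_particles.GetNewParticlesMaxIndex(),
        KOKKOS_LAMBDA(const int n) {
          const int idx = new_particles.GetNewParticleIndex(n);
          x(idx) = 0.0; // application-specific initial state
        });

The per-particle Real count that sizes the communication buffers is computed
by GetParticleDataSize(), shown next: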
int GetParticleDataSize() { int size = 0; - for (auto &v : std::get<0>(Vectors_)) { + for (auto &v : std::get<0>(vectors_)) { size += v->NumComponents(); } - for (auto &v : std::get<1>(Vectors_)) { + for (auto &v : std::get<1>(vectors_)) { size += v->NumComponents(); } @@ -222,13 +247,13 @@ class Swarm { void LoadBuffers_(const int max_indices_size); void UnloadBuffers_(); - void ApplyBoundaries_(const int nparticles, ParArrayND indices); + void ApplyBoundaries_(const int nparticles, ParArray1D indices); std::unique_ptr> bounds_uptrs[6]; template const auto &GetVariableVector() const { - return std::get()>(Vectors_); + return std::get()>(vectors_); } private: @@ -246,8 +271,8 @@ class Swarm { int CountParticlesToSend_(); void CountReceivedParticles_(); - void UpdateNeighborBufferReceiveIndices_(ParArrayND &neighbor_index, - ParArrayND &buffer_index); + void UpdateNeighborBufferReceiveIndices_(ParArray1D &neighbor_index, + ParArray1D &buffer_index); template SwarmVariablePack PackAllVariables_(PackIndexMap &vmap); @@ -264,23 +289,29 @@ class Swarm { Metadata m_; int nmax_pool_; std::string info_; - std::shared_ptr> pstatus_; - std::tuple, ParticleVariableVector> Vectors_; + std::tuple, ParticleVariableVector> vectors_; - std::tuple, MapToParticle> Maps_; + std::tuple, MapToParticle> maps_; std::list free_indices_; ParArray1D mask_; ParArray1D marked_for_removal_; - ParArrayND blockIndex_; // Neighbor index for each particle. -1 for current block. - ParArrayND neighborIndices_; // Indexing of vbvar's neighbor array. -1 for same. - // k,j indices unused in 3D&2D, 2D, respectively + ParArrayND block_index_; // Neighbor index for each particle. -1 for current block. + ParArrayND neighbor_indices_; // Indexing of vbvar's neighbor array. -1 for same. + // k,j indices unused in 3D&2D, 2D, respectively + ParArray1D new_indices_; // Persistent array that provides the new indices when + // AddEmptyParticles is called. Always defragmented. + int new_indices_max_idx_; // Maximum valid index of new_indices_ array. + ParArray1D from_to_indices_; // Array used for sorting particles during defragment + // step (size nmax_pool + 1). 
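The contract for this map, as used by the Defrag() kernels above: entry
n >= 0 means the particle in slot n moves to slot from_to_indices(n), while
negative entries are left in place. Applying it to any per-particle array
`v` (hypothetical) is a single pass:

    // Sketch: apply the defragment map to one per-particle array.
    pmb->par_for(
        PARTHENON_AUTO_LABEL, 0, max_active_index_,
        KOKKOS_LAMBDA(const int n) {
          if (from_to_indices(n) >= 0) v(from_to_indices(n)) = v(n);
        });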
+ ParArray1D recv_neighbor_index_; // Neighbor indices for received particles + ParArray1D recv_buffer_index_; // Buffer indices for received particles constexpr static int no_block_ = -2; constexpr static int this_block_ = -1; constexpr static int unset_index_ = -1; - ParArrayND num_particles_to_send_; + ParArray1D num_particles_to_send_; ParArrayND particle_indices_to_send_; std::vector neighbor_received_particles_; @@ -289,11 +320,13 @@ class Swarm { ParArrayND neighbor_buffer_index_; // Map from neighbor index to neighbor bufid ParArray1D - cellSorted_; // 1D per-cell sorted array of key-value swarm memory indices + cell_sorted_; // 1D per-cell sorted array of key-value swarm memory indices - ParArrayND cellSortedBegin_; // Per-cell array of starting indices in cell_sorted_ + ParArrayND + cell_sorted_begin_; // Per-cell array of starting indices in cell_sorted_ - ParArrayND cellSortedNumber_; // Per-cell array of number of particles in each cell + ParArrayND + cell_sorted_number_; // Per-cell array of number of particles in each cell public: bool mpiStatus; @@ -303,7 +336,7 @@ template inline vpack_types::SwarmVarList Swarm::MakeVarList_(const std::vector &names) { vpack_types::SwarmVarList vars; - auto variables = std::get()>(Maps_); + auto variables = std::get()>(maps_); for (auto name : names) { vars.push_front(variables[name]); @@ -315,7 +348,7 @@ template inline vpack_types::SwarmVarList Swarm::MakeVarListAll_() { int size = 0; vpack_types::SwarmVarList vars; - auto variables = std::get()>(Vectors_); + auto variables = std::get()>(vectors_); for (auto it = variables.rbegin(); it != variables.rend(); ++it) { auto v = *it; vars.push_front(v); @@ -338,8 +371,8 @@ inline SwarmVariablePack Swarm::PackVariables(const std::vector template inline SwarmVariablePack Swarm::PackAllVariables_(PackIndexMap &vmap) { std::vector names; - names.reserve(std::get()>(Vectors_).size()); - for (const auto &v : std::get()>(Vectors_)) { + names.reserve(std::get()>(vectors_).size()); + for (const auto &v : std::get()>(vectors_)) { names.push_back(v->label()); } @@ -352,8 +385,8 @@ inline void Swarm::Add_(const std::string &label, const Metadata &m) { ParticleVariable pvar(label, nmax_pool_, m); auto var = std::make_shared>(pvar); - std::get()>(Vectors_).push_back(var); - std::get()>(Maps_)[label] = var; + std::get()>(vectors_).push_back(var); + std::get()>(maps_)[label] = var; } using SP_Swarm = std::shared_ptr; diff --git a/src/interface/swarm_comms.cpp b/src/interface/swarm_comms.cpp new file mode 100644 index 000000000000..1bb33c2f8ea0 --- /dev/null +++ b/src/interface/swarm_comms.cpp @@ -0,0 +1,740 @@ +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. 
+//======================================================================================== +#include +#include +#include +#include +#include +#include + +#include "mesh/mesh.hpp" +#include "swarm.hpp" +#include "utils/error_checking.hpp" +#include "utils/sort.hpp" + +namespace parthenon { + +template +void Swarm::AllocateBoundariesImpl_(MeshBlock *pmb) { + std::stringstream msg; + auto &bcs = pmb->pmy_mesh->mesh_bcs; + if (bcs[iFace] == BoundaryFlag::outflow) { + bounds_uptrs[iFace] = DeviceAllocate(); + } else if (bcs[iFace] == BoundaryFlag::periodic) { + bounds_uptrs[iFace] = DeviceAllocate(); + } else if (bcs[iFace] == BoundaryFlag::user) { + if (pmb->pmy_mesh->SwarmBndryFnctn[iFace] != nullptr) { + bounds_uptrs[iFace] = pmb->pmy_mesh->SwarmBndryFnctn[iFace](); + } else { + msg << (iFace % 2 == 0 ? "i" : "o") << "x" << iFace / 2 + 1 + << " user boundary requested but provided function is null!"; + PARTHENON_THROW(msg); + } + } else { + msg << (iFace % 2 == 0 ? "i" : "o") << "x" << iFace / 2 + 1 << " boundary flag " + << static_cast(bcs[iFace]) << " not supported!"; + PARTHENON_THROW(msg); + } +} + +void Swarm::AllocateBoundaries() { + auto pmb = GetBlockPointer(); + std::stringstream msg; + + auto &bcs = pmb->pmy_mesh->mesh_bcs; + + AllocateBoundariesImpl_( + pmb.get()); + AllocateBoundariesImpl_( + pmb.get()); + AllocateBoundariesImpl_( + pmb.get()); + AllocateBoundariesImpl_( + pmb.get()); + AllocateBoundariesImpl_( + pmb.get()); + AllocateBoundariesImpl_( + pmb.get()); + + for (int n = 0; n < 6; n++) { + bounds_d.bounds[n] = bounds_uptrs[n].get(); + std::stringstream msg; + msg << "Boundary condition on face " << n << " missing.\n" + << "Please set it to `outflow`, `periodic`, or `user` in the input deck.\n" + << "If you set it to user, you must also manually set " + << "the swarm boundary pointer in your application." << std::endl; + PARTHENON_REQUIRE(bounds_d.bounds[n] != nullptr, msg); + } +} + +/// +/// Routine for precomputing neighbor indices to efficiently compute particle +/// position in terms of neighbor blocks based on spatial position. 
See +/// GetNeighborBlockIndex() +/// +void Swarm::SetNeighborIndices1D_() { + auto pmb = GetBlockPointer(); + const int ndim = pmb->pmy_mesh->ndim; + auto neighbor_indices_h = neighbor_indices_.GetHostMirror(); + + // Initialize array in event of zero neighbors + for (int k = 0; k < 4; k++) { + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + neighbor_indices_h(k, j, i) = no_block_; + } + } + } + + // Indicate which neighbor regions correspond to this meshblock + const int kmin = 0; + const int kmax = 4; + const int jmin = 0; + const int jmax = 4; + const int imin = 1; + const int imax = 3; + for (int k = kmin; k < kmax; k++) { + for (int j = jmin; j < jmax; j++) { + for (int i = imin; i < imax; i++) { + neighbor_indices_h(k, j, i) = this_block_; + } + } + } + + auto mesh_bcs = pmb->pmy_mesh->mesh_bcs; + // Indicate which neighbor regions correspond to each neighbor meshblock + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + NeighborBlock &nb = pmb->pbval->neighbor[n]; + + const int i = nb.ni.ox1; + + if (i == -1) { + neighbor_indices_h(0, 0, 0) = n; + } else if (i == 0) { + neighbor_indices_h(0, 0, 1) = n; + neighbor_indices_h(0, 0, 2) = n; + } else { + neighbor_indices_h(0, 0, 3) = n; + } + } + + neighbor_indices_.DeepCopy(neighbor_indices_h); +} + +void Swarm::SetNeighborIndices2D_() { + auto pmb = GetBlockPointer(); + const int ndim = pmb->pmy_mesh->ndim; + auto neighbor_indices_h = neighbor_indices_.GetHostMirror(); + + // Initialize array in event of zero neighbors + for (int k = 0; k < 4; k++) { + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + neighbor_indices_h(k, j, i) = no_block_; + } + } + } + + // Indicate which neighbor regions correspond to this meshblock + const int kmin = 0; + const int kmax = 4; + const int jmin = 1; + const int jmax = 3; + const int imin = 1; + const int imax = 3; + for (int k = kmin; k < kmax; k++) { + for (int j = jmin; j < jmax; j++) { + for (int i = imin; i < imax; i++) { + neighbor_indices_h(k, j, i) = this_block_; + } + } + } + + // Indicate which neighbor regions correspond to each neighbor meshblock + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + NeighborBlock &nb = pmb->pbval->neighbor[n]; + + const int i = nb.ni.ox1; + const int j = nb.ni.ox2; + + if (i == -1) { + if (j == -1) { + neighbor_indices_h(0, 0, 0) = n; + } else if (j == 0) { + neighbor_indices_h(0, 1, 0) = n; + neighbor_indices_h(0, 2, 0) = n; + } else if (j == 1) { + neighbor_indices_h(0, 3, 0) = n; + } + } else if (i == 0) { + if (j == -1) { + neighbor_indices_h(0, 0, 1) = n; + neighbor_indices_h(0, 0, 2) = n; + } else if (j == 1) { + neighbor_indices_h(0, 3, 1) = n; + neighbor_indices_h(0, 3, 2) = n; + } + } else if (i == 1) { + if (j == -1) { + neighbor_indices_h(0, 0, 3) = n; + } else if (j == 0) { + neighbor_indices_h(0, 1, 3) = n; + neighbor_indices_h(0, 2, 3) = n; + } else if (j == 1) { + neighbor_indices_h(0, 3, 3) = n; + } + } + } + + neighbor_indices_.DeepCopy(neighbor_indices_h); +} + +void Swarm::SetNeighborIndices3D_() { + auto pmb = GetBlockPointer(); + const int ndim = pmb->pmy_mesh->ndim; + auto neighbor_indices_h = neighbor_indices_.GetHostMirror(); + + // Initialize array in event of zero neighbors + for (int k = 0; k < 4; k++) { + for (int j = 0; j < 4; j++) { + for (int i = 0; i < 4; i++) { + neighbor_indices_h(k, j, i) = no_block_; + } + } + } + + // Indicate which neighbor regions correspond to this meshblock + const int kmin = 1; + const int kmax = 3; + const int jmin = 1; + const int jmax = 3; + const int imin = 1; + 
const int imax = 3; + for (int k = kmin; k < kmax; k++) { + for (int j = jmin; j < jmax; j++) { + for (int i = imin; i < imax; i++) { + neighbor_indices_h(k, j, i) = this_block_; + } + } + } + + // Indicate which neighbor regions correspond to each neighbor meshblock + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + NeighborBlock &nb = pmb->pbval->neighbor[n]; + + const int i = nb.ni.ox1; + const int j = nb.ni.ox2; + const int k = nb.ni.ox3; + + if (i == -1) { + if (j == -1) { + if (k == -1) { + neighbor_indices_h(0, 0, 0) = n; + } else if (k == 0) { + neighbor_indices_h(1, 0, 0) = n; + neighbor_indices_h(2, 0, 0) = n; + } else if (k == 1) { + neighbor_indices_h(3, 0, 0) = n; + } + } else if (j == 0) { + if (k == -1) { + neighbor_indices_h(0, 1, 0) = n; + neighbor_indices_h(0, 2, 0) = n; + } else if (k == 0) { + neighbor_indices_h(1, 1, 0) = n; + neighbor_indices_h(1, 2, 0) = n; + neighbor_indices_h(2, 1, 0) = n; + neighbor_indices_h(2, 2, 0) = n; + } else if (k == 1) { + neighbor_indices_h(3, 1, 0) = n; + neighbor_indices_h(3, 2, 0) = n; + } + } else if (j == 1) { + if (k == -1) { + neighbor_indices_h(0, 3, 0) = n; + } else if (k == 0) { + neighbor_indices_h(1, 3, 0) = n; + neighbor_indices_h(2, 3, 0) = n; + } else if (k == 1) { + neighbor_indices_h(3, 3, 0) = n; + } + } + } else if (i == 0) { + if (j == -1) { + if (k == -1) { + neighbor_indices_h(0, 0, 1) = n; + neighbor_indices_h(0, 0, 2) = n; + } else if (k == 0) { + neighbor_indices_h(1, 0, 1) = n; + neighbor_indices_h(1, 0, 2) = n; + neighbor_indices_h(2, 0, 1) = n; + neighbor_indices_h(2, 0, 2) = n; + } else if (k == 1) { + neighbor_indices_h(3, 0, 1) = n; + neighbor_indices_h(3, 0, 2) = n; + } + } else if (j == 0) { + if (k == -1) { + neighbor_indices_h(0, 1, 1) = n; + neighbor_indices_h(0, 1, 2) = n; + neighbor_indices_h(0, 2, 1) = n; + neighbor_indices_h(0, 2, 2) = n; + } else if (k == 1) { + neighbor_indices_h(3, 1, 1) = n; + neighbor_indices_h(3, 1, 2) = n; + neighbor_indices_h(3, 2, 1) = n; + neighbor_indices_h(3, 2, 2) = n; + } + } else if (j == 1) { + if (k == -1) { + neighbor_indices_h(0, 3, 1) = n; + neighbor_indices_h(0, 3, 2) = n; + } else if (k == 0) { + neighbor_indices_h(1, 3, 1) = n; + neighbor_indices_h(1, 3, 2) = n; + neighbor_indices_h(2, 3, 1) = n; + neighbor_indices_h(2, 3, 2) = n; + } else if (k == 1) { + neighbor_indices_h(3, 3, 1) = n; + neighbor_indices_h(3, 3, 2) = n; + } + } + } else if (i == 1) { + if (j == -1) { + if (k == -1) { + neighbor_indices_h(0, 0, 3) = n; + } else if (k == 0) { + neighbor_indices_h(1, 0, 3) = n; + neighbor_indices_h(2, 0, 3) = n; + } else if (k == 1) { + neighbor_indices_h(3, 0, 3) = n; + } + } else if (j == 0) { + if (k == -1) { + neighbor_indices_h(0, 1, 3) = n; + neighbor_indices_h(0, 2, 3) = n; + } else if (k == 0) { + neighbor_indices_h(1, 1, 3) = n; + neighbor_indices_h(1, 2, 3) = n; + neighbor_indices_h(2, 1, 3) = n; + neighbor_indices_h(2, 2, 3) = n; + } else if (k == 1) { + neighbor_indices_h(3, 1, 3) = n; + neighbor_indices_h(3, 2, 3) = n; + } + } else if (j == 1) { + if (k == -1) { + neighbor_indices_h(0, 3, 3) = n; + } else if (k == 0) { + neighbor_indices_h(1, 3, 3) = n; + neighbor_indices_h(2, 3, 3) = n; + } else if (k == 1) { + neighbor_indices_h(3, 3, 3) = n; + } + } + } + } + + neighbor_indices_.DeepCopy(neighbor_indices_h); +} + +void Swarm::SetupPersistentMPI() { + auto pmb = GetBlockPointer(); + vbswarm->SetupPersistentMPI(); + + const int ndim = pmb->pmy_mesh->ndim; + + const int nbmax = vbswarm->bd_var_.nbmax; + + // Build up convenience array of 
neighbor indices + if (ndim == 1) { + SetNeighborIndices1D_(); + } else if (ndim == 2) { + SetNeighborIndices2D_(); + } else if (ndim == 3) { + SetNeighborIndices3D_(); + } else { + PARTHENON_FAIL("ndim must be 1, 2, or 3 for particles!"); + } + + neighbor_received_particles_.resize(nbmax); + + // Build device array mapping neighbor index to neighbor bufid + if (pmb->pbval->nneighbor > 0) { + ParArrayND neighbor_buffer_index("Neighbor buffer index", pmb->pbval->nneighbor); + auto neighbor_buffer_index_h = neighbor_buffer_index.GetHostMirror(); + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + neighbor_buffer_index_h(n) = pmb->pbval->neighbor[n].bufid; + } + neighbor_buffer_index.DeepCopy(neighbor_buffer_index_h); + neighbor_buffer_index_ = neighbor_buffer_index; + } +} + +int Swarm::CountParticlesToSend_() { + auto block_index_h = block_index_.GetHostMirrorAndCopy(); + auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); + auto swarm_d = GetDeviceContext(); + auto pmb = GetBlockPointer(); + const int nbmax = vbswarm->bd_var_.nbmax; + + // Fence to make sure particles aren't currently being transported locally + // TODO(BRR) do this operation on device. + pmb->exec_space.fence(); + auto num_particles_to_send_h = num_particles_to_send_.GetHostMirror(); + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + num_particles_to_send_h(n) = 0; + } + const int particle_size = GetParticleDataSize(); + vbswarm->particle_size = particle_size; + + int max_indices_size = 0; + int total_noblock_particles = 0; + for (int n = 0; n <= max_active_index_; n++) { + if (mask_h(n)) { + // This particle should be sent + if (block_index_h(n) >= 0) { + num_particles_to_send_h(block_index_h(n))++; + if (max_indices_size < num_particles_to_send_h(block_index_h(n))) { + max_indices_size = num_particles_to_send_h(block_index_h(n)); + } + } + if (block_index_h(n) == no_block_) { + total_noblock_particles++; + } + } + } + // Size-0 arrays not permitted but we don't want to short-circuit subsequent logic + // that indicates completed communications + max_indices_size = std::max(1, max_indices_size); + + // Not a ragged-right array, just for convenience + if (total_noblock_particles > 0) { + auto noblock_indices = + ParArray1D("Particles with no block", total_noblock_particles); + auto noblock_indices_h = noblock_indices.GetHostMirror(); + int counter = 0; + for (int n = 0; n <= max_active_index_; n++) { + if (mask_h(n)) { + if (block_index_h(n) == no_block_) { + noblock_indices_h(counter) = n; + counter++; + } + } + } + noblock_indices.DeepCopy(noblock_indices_h); + ApplyBoundaries_(total_noblock_particles, noblock_indices); + } + + // TODO(BRR) don't allocate dynamically + particle_indices_to_send_ = + ParArrayND("Particle indices to send", nbmax, max_indices_size); + auto particle_indices_to_send_h = particle_indices_to_send_.GetHostMirror(); + std::vector counter(nbmax, 0); + for (int n = 0; n <= max_active_index_; n++) { + if (mask_h(n)) { + if (block_index_h(n) >= 0) { + particle_indices_to_send_h(block_index_h(n), counter[block_index_h(n)]) = n; + counter[block_index_h(n)]++; + } + } + } + num_particles_to_send_.DeepCopy(num_particles_to_send_h); + particle_indices_to_send_.DeepCopy(particle_indices_to_send_h); + + num_particles_sent_ = 0; + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + // Resize buffer if too small + const int bufid = pmb->pbval->neighbor[n].bufid; + auto sendbuf = vbswarm->bd_var_.send[bufid]; + if (sendbuf.extent(0) < num_particles_to_send_h(n) * particle_size) { + 
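CountParticlesToSend_ makes a single host-side pass over the particle mask to build the two structures LoadBuffers_ consumes: a per-neighbor send count and a table of particle indices per destination. A simplified sketch of that pass, with hypothetical names; the real code stores the indices in a dense nbmax x max_indices_size ParArrayND rather than a ragged vector:

#include <cassert>
#include <vector>

struct SendPlan {
  std::vector<int> counts;               // particles headed to each neighbor
  std::vector<std::vector<int>> indices; // which particles those are
};

SendPlan PlanSends(const std::vector<int> &block_index, int nneighbor) {
  SendPlan plan{std::vector<int>(nneighbor, 0),
                std::vector<std::vector<int>>(nneighbor)};
  for (int n = 0; n < static_cast<int>(block_index.size()); ++n) {
    const int dest = block_index[n];
    if (dest >= 0) { // negative entries stay on this block or have no block
      plan.counts[dest]++;
      plan.indices[dest].push_back(n);
    }
  }
  return plan;
}

int main() {
  // Particle 0 stays on this block (-1); particles 1 and 2 go to neighbor 1.
  auto plan = PlanSends({-1, 1, 1}, 2);
  assert(plan.counts[1] == 2 && plan.indices[1].front() == 1);
  return 0;
}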
sendbuf = BufArray1D("Buffer", num_particles_to_send_h(n) * particle_size); + vbswarm->bd_var_.send[bufid] = sendbuf; + } + vbswarm->send_size[bufid] = num_particles_to_send_h(n) * particle_size; + num_particles_sent_ += num_particles_to_send_h(n); + } + + return max_indices_size; +} + +void Swarm::LoadBuffers_(const int max_indices_size) { + auto swarm_d = GetDeviceContext(); + auto pmb = GetBlockPointer(); + const int particle_size = GetParticleDataSize(); + const int nneighbor = pmb->pbval->nneighbor; + + auto &int_vector = std::get()>(vectors_); + auto &real_vector = std::get()>(vectors_); + PackIndexMap real_imap; + PackIndexMap int_imap; + auto vreal = PackAllVariables_(real_imap); + auto vint = PackAllVariables_(int_imap); + const int realPackDim = vreal.GetDim(2); + const int intPackDim = vint.GetDim(2); + + // Pack index: + // [variable start] [swarm idx] + + auto &bdvar = vbswarm->bd_var_; + auto num_particles_to_send = num_particles_to_send_; + auto particle_indices_to_send = particle_indices_to_send_; + auto neighbor_buffer_index = neighbor_buffer_index_; + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, max_indices_size - 1, + KOKKOS_LAMBDA(const int n) { // Max index + for (int m = 0; m < nneighbor; m++) { // Number of neighbors + const int bufid = neighbor_buffer_index(m); + if (n < num_particles_to_send(m)) { + const int sidx = particle_indices_to_send(m, n); + int buffer_index = n * particle_size; + swarm_d.MarkParticleForRemoval(sidx); + for (int i = 0; i < realPackDim; i++) { + bdvar.send[bufid](buffer_index) = vreal(i, sidx); + buffer_index++; + } + for (int i = 0; i < intPackDim; i++) { + bdvar.send[bufid](buffer_index) = static_cast(vint(i, sidx)); + buffer_index++; + } + } + } + }); + + RemoveMarkedParticles(); +} + +void Swarm::Send(BoundaryCommSubset phase) { + auto pmb = GetBlockPointer(); + const int nneighbor = pmb->pbval->nneighbor; + auto swarm_d = GetDeviceContext(); + + if (nneighbor == 0) { + // Process physical boundary conditions on "sent" particles + auto block_index_h = block_index_.GetHostMirrorAndCopy(); + auto mask_h = Kokkos::create_mirror_view_and_copy(HostMemSpace(), mask_); + + int total_sent_particles = 0; + pmb->par_reduce( + PARTHENON_AUTO_LABEL, 0, max_active_index_, + KOKKOS_LAMBDA(int n, int &total_sent_particles) { + if (swarm_d.IsActive(n)) { + if (!swarm_d.IsOnCurrentMeshBlock(n)) { + total_sent_particles++; + } + } + }, + Kokkos::Sum(total_sent_particles)); + + if (total_sent_particles > 0) { + ParArray1D new_indices("new indices", total_sent_particles); + auto new_indices_h = new_indices.GetHostMirrorAndCopy(); + int sent_particle_index = 0; + for (int n = 0; n <= max_active_index_; n++) { + if (mask_h(n)) { + if (block_index_h(n) >= 0 || block_index_h(n) == no_block_) { + new_indices_h(sent_particle_index) = n; + sent_particle_index++; + } + } + } + new_indices.DeepCopy(new_indices_h); + + ApplyBoundaries_(total_sent_particles, new_indices); + } + } else { + // Query particles for those to be sent + int max_indices_size = CountParticlesToSend_(); + + // Prepare buffers for send operations + LoadBuffers_(max_indices_size); + + // Send buffer data + vbswarm->Send(phase); + } +} + +void Swarm::CountReceivedParticles_() { + auto pmb = GetBlockPointer(); + total_received_particles_ = 0; + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + const int bufid = pmb->pbval->neighbor[n].bufid; + if (vbswarm->bd_var_.flag[bufid] == BoundaryStatus::arrived) { + PARTHENON_DEBUG_REQUIRE(vbswarm->recv_size[bufid] % vbswarm->particle_size == 0, + "Receive 
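LoadBuffers_ above serializes each outgoing particle into a contiguous stride of particle_size Reals: every real variable first, then every integer variable cast to Real; UnloadBuffers_ below reverses exactly this layout on receipt. A host-side sketch of the packing convention, with a plain std::vector standing in for the Kokkos buffer and hypothetical names:

#include <cstddef>
#include <vector>

using Real = double;

// Pack one particle into its stride of the send buffer: reals first, then
// ints cast to Real, mirroring the loop order in LoadBuffers_.
void PackParticle(std::vector<Real> &buf, std::size_t slot,
                  const std::vector<Real> &reals, const std::vector<int> &ints) {
  std::size_t idx = slot * (reals.size() + ints.size());
  for (Real r : reals) buf[idx++] = r;
  for (int i : ints) buf[idx++] = static_cast<Real>(i);
}

int main() {
  std::vector<Real> buf(2 * 5); // two particles, 3 reals + 2 ints each
  PackParticle(buf, 0, {1.0, 2.0, 3.0}, {7, 8});
  PackParticle(buf, 1, {4.0, 5.0, 6.0}, {9, 10});
  return buf[5] == 4.0 ? 0 : 1; // particle 1 begins at its stride boundary
}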
buffer is not divisible by particle size!"); + neighbor_received_particles_[n] = + vbswarm->recv_size[bufid] / vbswarm->particle_size; + total_received_particles_ += neighbor_received_particles_[n]; + } else { + neighbor_received_particles_[n] = 0; + } + } +} + +void Swarm::UpdateNeighborBufferReceiveIndices_(ParArray1D &neighbor_index, + ParArray1D &buffer_index) { + auto pmb = GetBlockPointer(); + auto neighbor_index_h = neighbor_index.GetHostMirror(); + auto buffer_index_h = + buffer_index.GetHostMirror(); // Index of each particle in its received buffer + + int id = 0; + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + for (int m = 0; m < neighbor_received_particles_[n]; m++) { + neighbor_index_h(id) = n; + buffer_index_h(id) = m; + id++; + } + } + neighbor_index.DeepCopy(neighbor_index_h); + buffer_index.DeepCopy(buffer_index_h); +} + +void Swarm::UnloadBuffers_() { + auto pmb = GetBlockPointer(); + + CountReceivedParticles_(); + + auto &bdvar = vbswarm->bd_var_; + + if (total_received_particles_ > 0) { + auto newParticlesContext = AddEmptyParticles(total_received_particles_); + + auto &recv_neighbor_index = recv_neighbor_index_; + auto &recv_buffer_index = recv_buffer_index_; + UpdateNeighborBufferReceiveIndices_(recv_neighbor_index, recv_buffer_index); + auto neighbor_buffer_index = neighbor_buffer_index_; + + auto &int_vector = std::get()>(vectors_); + auto &real_vector = std::get()>(vectors_); + PackIndexMap real_imap; + PackIndexMap int_imap; + auto vreal = PackAllVariables_(real_imap); + auto vint = PackAllVariables_(int_imap); + int realPackDim = vreal.GetDim(2); + int intPackDim = vint.GetDim(2); + + // construct map from buffer index to swarm index (or just return vector of + // indices!) + const int particle_size = GetParticleDataSize(); + auto swarm_d = GetDeviceContext(); + + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, newParticlesContext.GetNewParticlesMaxIndex(), + // n is both new particle index and index over buffer values + KOKKOS_LAMBDA(const int n) { + const int sid = newParticlesContext.GetNewParticleIndex(n); + const int nid = recv_neighbor_index(n); + int bid = recv_buffer_index(n) * particle_size; + const int nbid = neighbor_buffer_index(nid); + for (int i = 0; i < realPackDim; i++) { + vreal(i, sid) = bdvar.recv[nbid](bid); + bid++; + } + for (int i = 0; i < intPackDim; i++) { + vint(i, sid) = static_cast(bdvar.recv[nbid](bid)); + bid++; + } + }); + + ApplyBoundaries_(total_received_particles_, new_indices_); + } +} + +void Swarm::ApplyBoundaries_(const int nparticles, ParArray1D indices) { + auto pmb = GetBlockPointer(); + auto &x = Get("x").Get(); + auto &y = Get("y").Get(); + auto &z = Get("z").Get(); + auto swarm_d = GetDeviceContext(); + auto bcs = this->bounds_d; + + pmb->par_for( + PARTHENON_AUTO_LABEL, 0, nparticles - 1, KOKKOS_LAMBDA(const int n) { + const int sid = indices(n); + for (int l = 0; l < 6; l++) { + bcs.bounds[l]->Apply(sid, x(sid), y(sid), z(sid), swarm_d); + } + }); + + RemoveMarkedParticles(); +} + +bool Swarm::Receive(BoundaryCommSubset phase) { + auto pmb = GetBlockPointer(); + const int nneighbor = pmb->pbval->nneighbor; + + if (nneighbor == 0) { + // Do nothing; no boundaries to receive + return true; + } else { + // Ensure all local deep copies marked BoundaryStatus::completed are actually + // received + pmb->exec_space.fence(); + + // Populate buffers + vbswarm->Receive(phase); + + // Transfer data from buffers to swarm memory pool + UnloadBuffers_(); + + auto &bdvar = vbswarm->bd_var_; + bool all_boundaries_received = true; + for 
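UpdateNeighborBufferReceiveIndices_ turns the per-neighbor receive counts into one flat entry per incoming particle, pairing the neighbor it came from with its position in that neighbor's buffer; this is what lets UnloadBuffers_ run a single 1D kernel over all received particles. The flattening, sketched on the host with hypothetical names:

#include <cassert>
#include <vector>

// One flat record per incoming particle: which neighbor it came from and
// its index within that neighbor's receive buffer.
void FlattenReceives(const std::vector<int> &counts, std::vector<int> &neighbor,
                     std::vector<int> &buffer) {
  for (int n = 0; n < static_cast<int>(counts.size()); ++n) {
    for (int m = 0; m < counts[n]; ++m) {
      neighbor.push_back(n);
      buffer.push_back(m);
    }
  }
}

int main() {
  std::vector<int> neighbor, buffer;
  FlattenReceives({2, 0, 1}, neighbor, buffer);
  // Three particles total: two from neighbor 0, one from neighbor 2.
  assert((neighbor == std::vector<int>{0, 0, 2}));
  assert((buffer == std::vector<int>{0, 1, 0}));
  return 0;
}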
(int n = 0; n < nneighbor; n++) { + NeighborBlock &nb = pmb->pbval->neighbor[n]; + if (bdvar.flag[nb.bufid] == BoundaryStatus::arrived) { + bdvar.flag[nb.bufid] = BoundaryStatus::completed; + } else if (bdvar.flag[nb.bufid] == BoundaryStatus::waiting) { + all_boundaries_received = false; + } + } + + return all_boundaries_received; + } +} + +void Swarm::ResetCommunication() { + auto pmb = GetBlockPointer(); +#ifdef MPI_PARALLEL + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + NeighborBlock &nb = pmb->pbval->neighbor[n]; + vbswarm->bd_var_.req_send[nb.bufid] = MPI_REQUEST_NULL; + } +#endif + + // Reset boundary statuses + for (int n = 0; n < pmb->pbval->nneighbor; n++) { + auto &nb = pmb->pbval->neighbor[n]; + vbswarm->bd_var_.flag[nb.bufid] = BoundaryStatus::waiting; + } +} + +bool Swarm::FinalizeCommunicationIterative() { + PARTHENON_THROW("FinalizeCommunicationIterative not yet implemented!"); + return true; +} + +void Swarm::AllocateComms(std::weak_ptr wpmb) { + if (wpmb.expired()) return; + + std::shared_ptr pmb = wpmb.lock(); + + // Create the boundary object + vbswarm = std::make_shared(pmb, label_); + + // Enroll SwarmVariable object + vbswarm->bswarm_index = pmb->pbswarm->bswarms.size(); + pmb->pbswarm->bswarms.push_back(vbswarm); +} + +} // namespace parthenon diff --git a/src/interface/swarm_container.cpp b/src/interface/swarm_container.cpp index 6abb9de69721..056d742bcea7 100644 --- a/src/interface/swarm_container.cpp +++ b/src/interface/swarm_container.cpp @@ -83,16 +83,15 @@ void SwarmContainer::Remove(const std::string &label) { // Return swarms meeting some conditions SwarmSet SwarmContainer::GetSwarmsByFlag(const Metadata::FlagCollection &flags) { - Kokkos::Profiling::pushRegion("GetSwarmsByFlag"); + PARTHENON_INSTRUMENT auto swarms = MetadataUtils::GetByFlag(flags, swarmMap_, swarmMetadataMap_); - Kokkos::Profiling::popRegion(); // GetSwarmsByFlag return swarms; } TaskStatus SwarmContainer::Defrag(double min_occupancy) { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Defrag"); + PARTHENON_INSTRUMENT PARTHENON_REQUIRE_THROWS(min_occupancy >= 0. 
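Swarm::Receive above treats the per-buffer flags as a small state machine: a buffer that has arrived is consumed and marked completed, while any buffer still waiting leaves the task incomplete so it is polled again; ResetCommunication then returns every flag to waiting for the next cycle. The same logic in isolation, as a sketch:

#include <vector>

enum class BoundaryStatus { waiting, arrived, completed };

// Returns true only once no buffer is still waiting; arrived buffers are
// retired to completed, exactly as in the loop over neighbors above.
bool ConsumeArrived(std::vector<BoundaryStatus> &flags) {
  bool all_received = true;
  for (auto &f : flags) {
    if (f == BoundaryStatus::arrived) {
      f = BoundaryStatus::completed;
    } else if (f == BoundaryStatus::waiting) {
      all_received = false;
    }
  }
  return all_received;
}

int main() {
  std::vector<BoundaryStatus> flags{BoundaryStatus::arrived, BoundaryStatus::waiting};
  bool done = ConsumeArrived(flags); // false: one buffer still waiting
  flags[1] = BoundaryStatus::arrived;
  done = ConsumeArrived(flags); // true: everything arrived or completed
  return done ? 0 : 1;
}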
&& min_occupancy <= 1., "Max fractional occupancy of swarm must be >= 0 and <= 1"); @@ -103,29 +102,24 @@ TaskStatus SwarmContainer::Defrag(double min_occupancy) { } } - Kokkos::Profiling::popRegion(); - return TaskStatus::complete; } TaskStatus SwarmContainer::DefragAll() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Defrag"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->Defrag(); } - Kokkos::Profiling::popRegion(); return TaskStatus::complete; } TaskStatus SwarmContainer::SortParticlesByCell() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_SortParticlesByCell"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->SortParticlesByCell(); } - Kokkos::Profiling::popRegion(); - return TaskStatus::complete; } @@ -150,18 +144,17 @@ void SwarmContainer::AllocateBoundaries() { } TaskStatus SwarmContainer::Send(BoundaryCommSubset phase) { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Send"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->Send(phase); } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_Send return TaskStatus::complete; } TaskStatus SwarmContainer::Receive(BoundaryCommSubset phase) { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_Receive"); + PARTHENON_INSTRUMENT int success = 0, total = 0; for (auto &s : swarmVector_) { @@ -171,24 +164,22 @@ TaskStatus SwarmContainer::Receive(BoundaryCommSubset phase) { total++; } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_Receive if (success == total) return TaskStatus::complete; return TaskStatus::incomplete; } TaskStatus SwarmContainer::ResetCommunication() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_ResetCommunication"); + PARTHENON_INSTRUMENT for (auto &s : swarmVector_) { s->ResetCommunication(); } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_ResetCommunication return TaskStatus::complete; } TaskStatus SwarmContainer::FinalizeCommunicationIterative() { - Kokkos::Profiling::pushRegion("Task_SwarmContainer_FinalizeCommunicationIterative"); + PARTHENON_INSTRUMENT PARTHENON_THROW("FinalizeCommunicationIterative not yet fully implemented!") @@ -200,7 +191,6 @@ TaskStatus SwarmContainer::FinalizeCommunicationIterative() { total++; } - Kokkos::Profiling::popRegion(); // Task_SwarmContainer_FinalizeCommunicationIterative if (success == total) return TaskStatus::complete; return TaskStatus::incomplete; } diff --git a/src/interface/swarm_device_context.hpp b/src/interface/swarm_device_context.hpp index e01b542b4925..8e9d7d010083 100644 --- a/src/interface/swarm_device_context.hpp +++ b/src/interface/swarm_device_context.hpp @@ -44,7 +44,7 @@ class SwarmDeviceContext { bool IsActive(int n) const { return mask_(n); } KOKKOS_FUNCTION - bool IsOnCurrentMeshBlock(int n) const { return blockIndex_(n) == this_block_; } + bool IsOnCurrentMeshBlock(int n) const { return block_index_(n) == this_block_; } KOKKOS_FUNCTION void MarkParticleForRemoval(int n) const { marked_for_removal_(n) = true; } @@ -68,16 +68,16 @@ class SwarmDeviceContext { // Ignore k,j indices as necessary based on problem dimension if (ndim_ == 1) { - blockIndex_(n) = neighborIndices_(0, 0, i); + block_index_(n) = neighbor_indices_(0, 0, i); } else if (ndim_ == 2) { - blockIndex_(n) = neighborIndices_(0, j, i); + block_index_(n) = neighbor_indices_(0, j, i); } else { - blockIndex_(n) = neighborIndices_(k, j, i); + block_index_(n) = neighbor_indices_(k, j, i); } - is_on_current_mesh_block = (blockIndex_(n) == this_block_); + is_on_current_mesh_block = (block_index_(n) == this_block_); - return 
blockIndex_(n); + return block_index_(n); } KOKKOS_INLINE_FUNCTION @@ -101,14 +101,14 @@ class SwarmDeviceContext { KOKKOS_INLINE_FUNCTION int GetParticleCountPerCell(const int k, const int j, const int i) const { - return cellSortedNumber_(k, j, i); + return cell_sorted_number_(k, j, i); } KOKKOS_INLINE_FUNCTION int GetFullIndex(const int k, const int j, const int i, const int n) const { - PARTHENON_DEBUG_REQUIRE(n < cellSortedNumber_(k, j, i), + PARTHENON_DEBUG_REQUIRE(n < cell_sorted_number_(k, j, i), "Particle index out of range!"); - return cellSorted_(cellSortedBegin_(k, j, i) + n).swarm_idx_; + return cell_sorted_(cell_sorted_begin_(k, j, i) + n).swarm_idx_; } // private: @@ -129,11 +129,11 @@ class SwarmDeviceContext { Real z_max_global_; ParArray1D mask_; ParArray1D marked_for_removal_; - ParArrayND blockIndex_; - ParArrayND neighborIndices_; // 4x4x4 array of possible block AMR regions - ParArray1D cellSorted_; - ParArrayND cellSortedBegin_; - ParArrayND cellSortedNumber_; + ParArrayND block_index_; + ParArrayND neighbor_indices_; // 4x4x4 array of possible block AMR regions + ParArray1D cell_sorted_; + ParArrayND cell_sorted_begin_; + ParArrayND cell_sorted_number_; int ndim_; friend class Swarm; constexpr static int this_block_ = -1; // Mirrors definition in Swarm class diff --git a/src/interface/update.cpp b/src/interface/update.cpp index ea13f83b1790..0e0d1195943e 100644 --- a/src/interface/update.cpp +++ b/src/interface/update.cpp @@ -48,7 +48,7 @@ TaskStatus FluxDivergence(MeshBlockData *in, MeshBlockData *dudt_con const auto &coords = pmb->coords; const int ndim = pmb->pmy_mesh->ndim; pmb->par_for( - "FluxDivergenceBlock", 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + PARTHENON_AUTO_LABEL, 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) { if (dudt.IsAllocated(l) && vin.IsAllocated(l)) { dudt(l, k, j, i) = FluxDivHelper(l, k, j, i, ndim, coords, vin); @@ -71,7 +71,7 @@ TaskStatus FluxDivergence(MeshData *in_obj, MeshData *dudt_obj) { const int ndim = vin.GetNdim(); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "FluxDivergenceMesh", DevExecSpace(), 0, vin.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, vin.GetDim(5) - 1, 0, vin.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int m, const int l, const int k, const int j, const int i) { if (dudt.IsAllocated(m, l) && vin.IsAllocated(m, l)) { @@ -100,8 +100,8 @@ TaskStatus UpdateWithFluxDivergence(MeshBlockData *u0_data, const auto &coords = pmb->coords; const int ndim = pmb->pmy_mesh->ndim; pmb->par_for( - "UpdateWithFluxDivergenceBlock", 0, u0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, - ib.e, KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) { + PARTHENON_AUTO_LABEL, 0, u0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + KOKKOS_LAMBDA(const int l, const int k, const int j, const int i) { if (u0.IsAllocated(l) && u1.IsAllocated(l)) { u0(l, k, j, i) = gam0 * u0(l, k, j, i) + gam1 * u1(l, k, j, i) + beta_dt * FluxDivHelper(l, k, j, i, ndim, coords, u0); @@ -126,7 +126,7 @@ TaskStatus UpdateWithFluxDivergence(MeshData *u0_data, MeshData *u1_ const int ndim = u0_pack.GetNdim(); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "UpdateWithFluxDivergenceMesh", DevExecSpace(), 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, u0_pack.GetDim(5) - 1, 0, u0_pack.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int m, const int l, const 
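The update.cpp hunks above replace hand-written kernel-name strings such as "FluxDivergenceBlock" with PARTHENON_AUTO_LABEL. The macro itself is defined in utils/instrument.hpp, which this patch does not show; a plausible reconstruction from the standard predefined macros, shown here only to illustrate why labels can stay unique and track the code without manual upkeep:

#include <iostream>
#include <string>

// Hypothetical stand-in for PARTHENON_AUTO_LABEL (the real definition lives
// in src/utils/instrument.hpp): build the kernel label from the enclosing
// function name and line number instead of a hand-maintained string.
#define MY_AUTO_LABEL (std::string(__func__) + "_line_" + std::to_string(__LINE__))

void FluxDivergenceBlock() {
  std::cout << MY_AUTO_LABEL << "\n"; // e.g. "FluxDivergenceBlock_line_10"
}

int main() {
  FluxDivergenceBlock();
  return 0;
}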
int k, const int j, const int i) { if (u0_pack.IsAllocated(m, l) && u1_pack.IsAllocated(m, l)) { @@ -140,12 +140,11 @@ TaskStatus UpdateWithFluxDivergence(MeshData *u0_data, MeshData *u1_ } TaskStatus SparseDealloc(MeshData *md) { + PARTHENON_INSTRUMENT if (!Globals::sparse_config.enabled || (md->NumBlocks() == 0)) { return TaskStatus::complete; } - Kokkos::Profiling::pushRegion("Task_SparseDealloc"); - const IndexRange ib = md->GetBoundsI(IndexDomain::entire); const IndexRange jb = md->GetBoundsJ(IndexDomain::entire); const IndexRange kb = md->GetBoundsK(IndexDomain::entire); @@ -163,7 +162,7 @@ TaskStatus SparseDealloc(MeshData *md) { const int NjNi = Nj * Ni; const int NkNjNi = Nk * NjNi; Kokkos::parallel_for( - "SparseDealloc", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), pack.GetNBlocks(), Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -219,7 +218,6 @@ TaskStatus SparseDealloc(MeshData *md) { } } - Kokkos::Profiling::popRegion(); // Task_SparseDealloc return TaskStatus::complete; } diff --git a/src/interface/update.hpp b/src/interface/update.hpp index eb0dbc57d3e6..21f035caefa3 100644 --- a/src/interface/update.hpp +++ b/src/interface/update.hpp @@ -70,12 +70,12 @@ TaskStatus UpdateWithFluxDivergence(T *data_u0, T *data_u1, const Real gam0, template TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const Real w2, T *out) { - Kokkos::Profiling::pushRegion("Task_WeightedSumData"); + PARTHENON_INSTRUMENT const auto &x = in1->PackVariables(flags); const auto &y = in2->PackVariables(flags); const auto &z = out->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "WeightedSumData", DevExecSpace(), 0, x.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0, x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { // TOOD(someone) This is potentially dangerous and/or not intended behavior @@ -85,7 +85,6 @@ TaskStatus WeightedSumData(const F &flags, T *in1, T *in2, const Real w1, const z(b, l, k, j, i) = w1 * x(b, l, k, j, i) + w2 * y(b, l, k, j, i); } }); - Kokkos::Profiling::popRegion(); // Task_WeightedSumData return TaskStatus::complete; } @@ -96,17 +95,16 @@ TaskStatus CopyData(const F &flags, T *in, T *out) { template TaskStatus SetDataToConstant(const F &flags, T *data, const Real val) { - Kokkos::Profiling::pushRegion("Task_SetDataToConstant"); + PARTHENON_INSTRUMENT const auto &x = data->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "SetDataToConstant", DevExecSpace(), 0, x.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, x.GetDim(5) - 1, 0, x.GetDim(4) - 1, 0, x.GetDim(3) - 1, 0, x.GetDim(2) - 1, 0, x.GetDim(1) - 1, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (x.IsAllocated(b, l)) { x(b, l, k, j, i) = val; } }); - Kokkos::Profiling::popRegion(); // Task_SetDataToConstant return TaskStatus::complete; } @@ -148,7 +146,7 @@ template TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data, const LowStorageIntegrator *pint, Real dt, int stage, bool update_s1) { - Kokkos::Profiling::pushRegion("Task_2S_Update"); + PARTHENON_INSTRUMENT const auto &s0 = s0_data->PackVariables(flags); const auto &s1 = s1_data->PackVariables(flags); const auto &rhs = rhs_data->PackVariables(flags); @@ -163,7 +161,7 @@ TaskStatus 
Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data, Real gam0 = pint->gam0[stage - 1]; Real gam1 = pint->gam1[stage - 1]; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "2S_Update", DevExecSpace(), 0, s0.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, s0.GetDim(5) - 1, 0, s0.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (s0.IsAllocated(b, l) && s1.IsAllocated(b, l) && rhs.IsAllocated(b, l)) { @@ -174,7 +172,6 @@ TaskStatus Update2S(const F &flags, T *s0_data, T *s1_data, T *rhs_data, beta * dt * rhs(b, l, k, j, i); } }); - Kokkos::Profiling::popRegion(); // Task_2S_Update return TaskStatus::complete; } template @@ -194,7 +191,7 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr base_data, std::vector> stage_data, std::shared_ptr out_data, const ButcherIntegrator *pint, Real dt, int stage) { - Kokkos::Profiling::pushRegion("Task_Butcher_Sum"); + PARTHENON_INSTRUMENT const auto &out = out_data->PackVariables(flags); const auto &in = base_data->PackVariables(flags); const IndexDomain interior = IndexDomain::interior; @@ -202,7 +199,7 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr base_data, const IndexRange jb = out_data->GetBoundsJ(interior); const IndexRange kb = out_data->GetBoundsK(interior); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ButcherSumInit", DevExecSpace(), 0, out.GetDim(5) - 1, 0, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1, 0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) { @@ -213,15 +210,14 @@ TaskStatus SumButcher(const F &flags, std::shared_ptr base_data, Real a = pint->a[stage - 1][prev]; const auto &in = stage_data[stage]->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ButcherSum", DevExecSpace(), 0, out.GetDim(5) - 1, 0, - out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1, + 0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) { out(b, l, k, j, i) += dt * a * in(b, l, k, j, i); } }); } - Kokkos::Profiling::popRegion(); // Task_Butcher_Sum return TaskStatus::complete; } template @@ -238,7 +234,7 @@ template TaskStatus UpdateButcher(const F &flags, std::vector> stage_data, std::shared_ptr out_data, const ButcherIntegrator *pint, Real dt) { - Kokkos::Profiling::pushRegion("Task_Butcher_Update"); + PARTHENON_INSTRUMENT const auto &out = out_data->PackVariables(flags); const IndexDomain interior = IndexDomain::interior; @@ -251,15 +247,14 @@ TaskStatus UpdateButcher(const F &flags, std::vector> stage_d const Real butcher_b = pint->b[stage]; const auto &in = stage_data[stage]->PackVariables(flags); parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ButcherUpdate", DevExecSpace(), 0, out.GetDim(5) - 1, 0, - out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, out.GetDim(5) - 1, + 0, out.GetDim(4) - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int l, const int k, const int j, const int i) { if (out.IsAllocated(b, l) && in.IsAllocated(b, l)) { out(b, l, k, j, i) += dt * b * in(b, l, k, j, i); } }); } - 
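Per cell and per variable, Update2S above applies the two-register low-storage update s0 <- gam0*s0 + gam1*s1 + beta*dt*rhs with stage coefficients from the LowStorageIntegrator, and SumButcher accumulates out += dt*a[stage][prev]*in over earlier stages. The arithmetic in scalar form (what update_s1 does to s1 beforehand is outside the visible hunks):

#include <cassert>

// Scalar form of the per-cell updates in Update2S and SumButcher above.
double TwoRegisterUpdate(double s0, double s1, double rhs, double gam0,
                         double gam1, double beta, double dt) {
  return gam0 * s0 + gam1 * s1 + beta * dt * rhs;
}

double ButcherAccumulate(double out, double a, double in, double dt) {
  return out + dt * a * in;
}

int main() {
  // Forward-Euler limit: gam0 = 1, gam1 = 0, beta = 1 reduces to s0 + dt*rhs.
  assert(TwoRegisterUpdate(2.0, 0.0, 3.0, 1.0, 0.0, 1.0, 0.5) == 3.5);
  return 0;
}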
Kokkos::Profiling::popRegion(); // Task_Butcher_Update return TaskStatus::complete; } template @@ -272,53 +267,54 @@ TaskStatus UpdateButcherIndependent(std::vector> stage_data, template TaskStatus EstimateTimestep(T *rc) { - Kokkos::Profiling::pushRegion("Task_EstimateTimestep"); + PARTHENON_INSTRUMENT Real dt_min = std::numeric_limits::max(); for (const auto &pkg : rc->GetParentPointer()->packages.AllPackages()) { Real dt = pkg.second->EstimateTimestep(rc); dt_min = std::min(dt_min, dt); } rc->SetAllowedDt(dt_min); - Kokkos::Profiling::popRegion(); // Task_EstimateTimestep return TaskStatus::complete; } template TaskStatus PreCommFillDerived(T *rc) { - Kokkos::Profiling::pushRegion("Task_PreCommFillDerived"); + PARTHENON_INSTRUMENT auto pm = rc->GetParentPointer(); for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->PreCommFillDerived(rc); } - Kokkos::Profiling::popRegion(); return TaskStatus::complete; } template TaskStatus FillDerived(T *rc) { - Kokkos::Profiling::pushRegion("Task_FillDerived"); + PARTHENON_INSTRUMENT auto pm = rc->GetParentPointer(); - Kokkos::Profiling::pushRegion("PreFillDerived"); - for (const auto &pkg : pm->packages.AllPackages()) { - pkg.second->PreFillDerived(rc); - } - Kokkos::Profiling::popRegion(); // PreFillDerived - Kokkos::Profiling::pushRegion("FillDerived"); - for (const auto &pkg : pm->packages.AllPackages()) { - pkg.second->FillDerived(rc); - } - Kokkos::Profiling::popRegion(); // FillDerived - Kokkos::Profiling::pushRegion("PostFillDerived"); - for (const auto &pkg : pm->packages.AllPackages()) { - pkg.second->PostFillDerived(rc); - } - Kokkos::Profiling::popRegion(); // PostFillDerived - Kokkos::Profiling::popRegion(); // Task_FillDerived + { // PreFillDerived region + PARTHENON_INSTRUMENT + for (const auto &pkg : pm->packages.AllPackages()) { + pkg.second->PreFillDerived(rc); + } + } // PreFillDerived region + { // FillDerived region + PARTHENON_INSTRUMENT + for (const auto &pkg : pm->packages.AllPackages()) { + pkg.second->FillDerived(rc); + } + } // FillDerived region + { // PostFillDerived region + PARTHENON_INSTRUMENT + for (const auto &pkg : pm->packages.AllPackages()) { + pkg.second->PostFillDerived(rc); + } + } // PostFillDerived region return TaskStatus::complete; } template TaskStatus InitNewlyAllocatedVars(T *rc) { + PARTHENON_INSTRUMENT if (!rc->AllVariablesInitialized()) { const IndexDomain interior = IndexDomain::interior; const IndexRange ib = rc->GetBoundsI(interior); @@ -338,7 +334,7 @@ TaskStatus InitNewlyAllocatedVars(T *rc) { auto v = desc.GetPack(rc); Kokkos::parallel_for( - "Set newly allocated interior to default", + PARTHENON_AUTO_LABEL, Kokkos::TeamPolicy<>(parthenon::DevExecSpace(), v.GetNBlocks(), Kokkos::AUTO), KOKKOS_LAMBDA(parthenon::team_mbr_t team_member) { const int b = team_member.league_rank(); @@ -369,12 +365,10 @@ TaskStatus InitNewlyAllocatedVars(T *rc) { // This has to be done even in the case where no blocks have been allocated // since the boundaries of allocated blocks could have received default data // in any case - Kokkos::Profiling::pushRegion("Task_InitNewlyAllocatedVars"); auto pm = rc->GetParentPointer(); for (const auto &pkg : pm->packages.AllPackages()) { pkg.second->InitNewlyAllocatedVars(rc); } - Kokkos::Profiling::popRegion(); // Don't worry about flagging variables as initialized // since they will be flagged at the beginning of the diff --git a/src/kokkos_abstraction.hpp b/src/kokkos_abstraction.hpp index bfaf6a247a66..c2cc09ce879f 100644 --- a/src/kokkos_abstraction.hpp +++ 
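Throughout these hunks, manually paired Kokkos::Profiling::pushRegion/popRegion calls give way to a single PARTHENON_INSTRUMENT at the top of a scope, and the FillDerived rewrite above shows the new idiom of braced sub-scopes for sub-regions. The macro's definition is in utils/instrument.hpp, not shown in this patch; the essential RAII mechanism it relies on looks like this sketch:

#include <Kokkos_Core.hpp>
#include <string>

// Illustrative reconstruction only; see src/utils/instrument.hpp for the
// real PARTHENON_INSTRUMENT. A guard object pops the profiling region on
// every exit path, so forgetting popRegion before a return is impossible.
struct ScopedRegion {
  explicit ScopedRegion(const std::string &name) { Kokkos::Profiling::pushRegion(name); }
  ~ScopedRegion() { Kokkos::Profiling::popRegion(); }
  ScopedRegion(const ScopedRegion &) = delete;
  ScopedRegion &operator=(const ScopedRegion &) = delete;
};

#define MY_INSTRUMENT_REGION(name) ScopedRegion scoped_region_guard_{name};

void SomeTask() {
  MY_INSTRUMENT_REGION("Task_SomeTask")
  { // sub-region, closed at the brace like the FillDerived rewrite above
    MY_INSTRUMENT_REGION("SubStep")
  }
} // outer region popped here automatically

int main(int argc, char **argv) {
  Kokkos::initialize(argc, argv);
  SomeTask();
  Kokkos::finalize();
  return 0;
}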
b/src/kokkos_abstraction.hpp @@ -27,8 +27,11 @@ #include +#include "basic_types.hpp" #include "parthenon_array_generic.hpp" #include "utils/error_checking.hpp" +#include "utils/instrument.hpp" +#include "utils/multi_pointer.hpp" #include "utils/object_pool.hpp" namespace parthenon { @@ -127,6 +130,17 @@ using ScratchPad5D = Kokkos::View using ScratchPad6D = Kokkos::View; +// Used for ParArrayND +// TODO(JMM): Should all of parthenon_arrays.hpp +// be moved here? Or should all of the above stuff be moved to +// parthenon_arrays.hpp? +inline constexpr std::size_t MAX_VARIABLE_DIMENSION = 7; +template +using device_view_t = + Kokkos::View, Layout, DevMemSpace>; +template +using host_view_t = typename device_view_t::HostMirror; + // Defining tags to determine loop_patterns using a tag dispatch design pattern // Translates a non-Kokkos standard C++ nested `for` loop where the innermost @@ -334,13 +348,12 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, DevExecSpace exec_space, const int &kl, const int &ku, const int &jl, const int &ju, const int &il, const int &iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto k = kl; k <= ku; k++) for (auto j = jl; j <= ju; j++) #pragma omp simd for (auto i = il; i <= iu; i++) function(k, j, i); - Kokkos::Profiling::popRegion(); } // 4D loop using Kokkos 1D Range @@ -468,14 +481,13 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, DevExecSpace exec_space, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto n = nl; n <= nu; n++) for (auto k = kl; k <= ku; k++) for (auto j = jl; j <= ju; j++) #pragma omp simd for (auto i = il; i <= iu; i++) function(n, k, j, i); - Kokkos::Profiling::popRegion(); } // 5D loop using MDRange loops @@ -536,7 +548,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto b = bl; b <= bu; b++) for (auto n = nl; n <= nu; n++) for (auto k = kl; k <= ku; k++) @@ -544,7 +556,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, #pragma omp simd for (auto i = il; i <= iu; i++) function(b, n, k, j, i); - Kokkos::Profiling::popRegion(); } // 6D loop using MDRange loops @@ -609,7 +620,7 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, const int ml, const int mu, const int nl, const int nu, const int kl, const int ku, const int jl, const int ju, const int il, const int iu, const Function &function) { - Kokkos::Profiling::pushRegion(name); + PARTHENON_INSTRUMENT_REGION(name) for (auto l = ll; l <= lu; l++) for (auto m = ml; m <= mu; m++) for (auto n = nl; n <= nu; n++) @@ -618,7 +629,6 @@ inline void par_dispatch(LoopPatternSimdFor, const std::string &name, #pragma omp simd for (auto i = il; i <= iu; i++) function(l, m, n, k, j, i); - Kokkos::Profiling::popRegion(); } template diff --git a/src/mesh/amr_loadbalance.cpp b/src/mesh/amr_loadbalance.cpp index 3db82c4fef65..5c3f847f1e36 100644 --- a/src/mesh/amr_loadbalance.cpp +++ b/src/mesh/amr_loadbalance.cpp @@ -127,8 +127,8 @@ bool TryRecvCoarseToFine(int lid_recv, int send_rank, const LogicalLocation &fin const int is = (ox1 == 0) 
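kokkos_abstraction.hpp now fixes MAX_VARIABLE_DIMENSION at 7 and pairs a maximal-rank device view type with its matching host mirror. multi_pointer_t comes from the new utils/multi_pointer.hpp header, which this patch does not show; assuming it expands to a rank-N pointer type, the alias pair behaves like this sketch:

#include <Kokkos_Core.hpp>

using Real = double;

// Assumed expansion of multi_pointer_t<Real, 7>: a rank-7 view data type.
using rank7_ptr = Real *******;
using device_view = Kokkos::View<rank7_ptr, Kokkos::LayoutLeft,
                                 Kokkos::DefaultExecutionSpace::memory_space>;
using host_view = typename device_view::HostMirror;

int main(int argc, char **argv) {
  Kokkos::initialize(argc, argv);
  {
    // Unused leading dimensions are sized to 1, as ParArrayND does.
    device_view d("d", 1, 1, 1, 1, 2, 3, 4);
    host_view h = Kokkos::create_mirror_view(d);
    h(0, 0, 0, 0, 1, 2, 3) = 42.0;
    Kokkos::deep_copy(d, h);
  }
  Kokkos::finalize();
  return 0;
}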
? 0 : (ib_int.e - ib_int.s + 1) / 2; const int idx_te = static_cast(te) % 3; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ReceiveCoarseToFineAMR", DevExecSpace(), 0, nt, 0, nu, - 0, nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nt, 0, nu, 0, + nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int t, const int u, const int v, const int k, const int j, const int i) { cb(idx_te, t, u, v, k, j, i) = fb(idx_te, t, u, v, k + ks, j + js, i + is); @@ -220,8 +220,8 @@ bool TryRecvFineToCoarse(int lid_recv, int send_rank, const LogicalLocation &fin const int is = (ox1 == 0) ? 0 : (ib.e - ib.s + 1 - TopologicalOffsetI(te)); const int idx_te = static_cast(te) % 3; parthenon::par_for( - DEFAULT_LOOP_PATTERN, "ReceiveFineToCoarseAMR", DevExecSpace(), 0, nt, 0, nu, - 0, nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, nt, 0, nu, 0, + nv, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e, KOKKOS_LAMBDA(const int t, const int u, const int v, const int k, const int j, const int i) { fb(idx_te, t, u, v, k + ks, j + js, i + is) = cb(idx_te, t, u, v, k, j, i); @@ -318,7 +318,7 @@ bool TryRecvSameToSame(int lid_recv, int send_rank, Variable *var, MeshBlo void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin, ApplicationInput *app_in) { - Kokkos::Profiling::pushRegion("LoadBalancingAndAdaptiveMeshRefinement"); + PARTHENON_INSTRUMENT int nnew = 0, ndel = 0; if (adaptive) { @@ -343,7 +343,6 @@ void Mesh::LoadBalancingAndAdaptiveMeshRefinement(ParameterInput *pin, } lb_flag_ = false; } - Kokkos::Profiling::popRegion(); // LoadBalancingAndAdaptiveMeshRefinement } // Private routines @@ -406,7 +405,7 @@ void UpdateBlockList(std::vector const &ranklist, std::vector &nslist, void Mesh::CalculateLoadBalance(std::vector const &costlist, std::vector &ranklist, std::vector &nslist, std::vector &nblist) { - Kokkos::Profiling::pushRegion("CalculateLoadBalance"); + PARTHENON_INSTRUMENT auto const total_blocks = costlist.size(); using it = std::vector::const_iterator; @@ -453,7 +452,6 @@ void Mesh::CalculateLoadBalance(std::vector const &costlist, << std::endl; } } - Kokkos::Profiling::popRegion(); // CalculateLoadBalance } //---------------------------------------------------------------------------------------- @@ -493,7 +491,7 @@ void Mesh::UpdateCostList() { // \brief collect refinement flags and manipulate the MeshBlockTree void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) { - Kokkos::Profiling::pushRegion("UpdateMeshBlockTree"); + PARTHENON_INSTRUMENT // compute nleaf= number of leaf MeshBlocks per refined block int nleaf = 2; if (!mesh_size.symmetry(X2DIR)) nleaf = 4; @@ -521,7 +519,6 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) { tnderef += nderef[n]; } if (tnref == 0 && tnderef < nleaf) { // nothing to do - Kokkos::Profiling::popRegion(); // UpdateMeshBlockTree return; } @@ -623,8 +620,6 @@ void Mesh::UpdateMeshBlockTree(int &nnew, int &ndel) { bt->Derefine(ndel); } if (tnderef >= nleaf) delete[] clderef; - - Kokkos::Profiling::popRegion(); // UpdateMeshBlockTree } //---------------------------------------------------------------------------------------- @@ -665,7 +660,7 @@ bool Mesh::GatherCostListAndCheckBalance() { void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput *app_in, int ntot) { - Kokkos::Profiling::pushRegion("RedistributeAndRefineMeshBlocks"); + PARTHENON_INSTRUMENT // kill any cached packs mesh_data.PurgeNonBase(); 
mesh_data.Get()->ClearCaches(); @@ -676,58 +671,57 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput if (!mesh_size.symmetry(X3DIR)) nleaf = 8; // construct new lists - Kokkos::Profiling::pushRegion("Construct new list"); std::vector newloc(ntot); std::vector newrank(ntot); std::vector newcost(ntot); std::vector newtoold(ntot); std::vector oldtonew(nbtotal); - int nbtold = nbtotal; - tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal); - - // create a list mapping the previous gid to the current one - oldtonew[0] = 0; - int mb_idx = 1; - for (int n = 1; n < ntot; n++) { - if (newtoold[n] == newtoold[n - 1] + 1) { // normal - oldtonew[mb_idx++] = n; - } else if (newtoold[n] == newtoold[n - 1] + nleaf) { // derefined - for (int j = 0; j < nleaf - 1; j++) - oldtonew[mb_idx++] = n - 1; - oldtonew[mb_idx++] = n; - } - } - // fill the last block - for (; mb_idx < nbtold; mb_idx++) - oldtonew[mb_idx] = ntot - 1; - - current_level = 0; std::unordered_set newly_refined; - for (int n = 0; n < ntot; n++) { - // "on" = "old n" = "old gid" = "old global MeshBlock ID" - int on = newtoold[n]; - if (newloc[n].level() > current_level) // set the current max level - current_level = newloc[n].level(); - if (newloc[n].level() >= loclist[on].level()) { // same or refined - newcost[n] = costlist[on]; - // Keep a list of all blocks refined for below - if (newloc[n].level() > loclist[on].level()) { - newly_refined.insert(newloc[n]); - } - } else { - double acost = 0.0; - for (int l = 0; l < nleaf; l++) - acost += costlist[on + l]; - newcost[n] = acost / nleaf; - } - } - // store old nbstart and nbend before load balancing. int onbs = nslist[Globals::my_rank]; int onbe = onbs + nblist[Globals::my_rank] - 1; - Kokkos::Profiling::popRegion(); // Construct new list + { // Construct new list region + PARTHENON_INSTRUMENT + tree.GetMeshBlockList(newloc.data(), newtoold.data(), nbtotal); + + // create a list mapping the previous gid to the current one + oldtonew[0] = 0; + int mb_idx = 1; + for (int n = 1; n < ntot; n++) { + if (newtoold[n] == newtoold[n - 1] + 1) { // normal + oldtonew[mb_idx++] = n; + } else if (newtoold[n] == newtoold[n - 1] + nleaf) { // derefined + for (int j = 0; j < nleaf - 1; j++) + oldtonew[mb_idx++] = n - 1; + oldtonew[mb_idx++] = n; + } + } + // fill the last block + for (; mb_idx < nbtold; mb_idx++) + oldtonew[mb_idx] = ntot - 1; + + current_level = 0; + for (int n = 0; n < ntot; n++) { + // "on" = "old n" = "old gid" = "old global MeshBlock ID" + int on = newtoold[n]; + if (newloc[n].level() > current_level) // set the current max level + current_level = newloc[n].level(); + if (newloc[n].level() >= loclist[on].level()) { // same or refined + newcost[n] = costlist[on]; + // Keep a list of all blocks refined for below + if (newloc[n].level() > loclist[on].level()) { + newly_refined.insert(newloc[n]); + } + } else { + double acost = 0.0; + for (int l = 0; l < nleaf; l++) + acost += costlist[on + l]; + newcost[n] = acost / nleaf; + } + } + } // Construct new list region // Calculate new load balance CalculateLoadBalance(newcost, newrank, nslist, nblist); @@ -764,64 +758,67 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput #ifdef MPI_PARALLEL // Send data from old to new blocks - Kokkos::Profiling::pushRegion("AMR: Send"); std::vector send_reqs; - for (int n = onbs; n <= onbe; n++) { - int nn = oldtonew[n]; - LogicalLocation &oloc = loclist[n]; - LogicalLocation &nloc = newloc[nn]; - auto pb = FindMeshBlock(n); - if 
(nloc.level() == oloc.level() && - newrank[nn] != Globals::my_rank) { // same level, different rank - for (auto &var : pb->vars_cc_) - send_reqs.emplace_back(SendSameToSame(nn - nslist[newrank[nn]], newrank[nn], - var.get(), pb.get(), this)); - } else if (nloc.level() > oloc.level()) { // c2f - // c2f must communicate to multiple leaf blocks (unlike f2c, same2same) - for (int l = 0; l < nleaf; l++) { - const int nl = nn + l; // Leaf block index in new global block list - LogicalLocation &nloc = newloc[nl]; + { // AMR Send region + PARTHENON_INSTRUMENT + for (int n = onbs; n <= onbe; n++) { + int nn = oldtonew[n]; + LogicalLocation &oloc = loclist[n]; + LogicalLocation &nloc = newloc[nn]; + auto pb = FindMeshBlock(n); + if (nloc.level() == oloc.level() && + newrank[nn] != Globals::my_rank) { // same level, different rank for (auto &var : pb->vars_cc_) - send_reqs.emplace_back(SendCoarseToFine(nl - nslist[newrank[nl]], newrank[nl], - nloc, var.get(), this)); - } // end loop over nleaf (unique to c2f branch in this step 6) - } else if (nloc.level() < oloc.level()) { // f2c: restrict + pack + send - for (auto &var : pb->vars_cc_) - send_reqs.emplace_back(SendFineToCoarse(nn - nslist[newrank[nn]], newrank[nn], - oloc, var.get(), this)); + send_reqs.emplace_back(SendSameToSame(nn - nslist[newrank[nn]], newrank[nn], + var.get(), pb.get(), this)); + } else if (nloc.level() > oloc.level()) { // c2f + // c2f must communicate to multiple leaf blocks (unlike f2c, same2same) + for (int l = 0; l < nleaf; l++) { + const int nl = nn + l; // Leaf block index in new global block list + LogicalLocation &nloc = newloc[nl]; + for (auto &var : pb->vars_cc_) + send_reqs.emplace_back(SendCoarseToFine(nl - nslist[newrank[nl]], newrank[nl], + nloc, var.get(), this)); + } // end loop over nleaf (unique to c2f branch in this step 6) + } else if (nloc.level() < oloc.level()) { // f2c: restrict + pack + send + for (auto &var : pb->vars_cc_) + send_reqs.emplace_back(SendFineToCoarse(nn - nslist[newrank[nn]], newrank[nn], + oloc, var.get(), this)); + } } - } - Kokkos::Profiling::popRegion(); // AMR: Send -#endif // MPI_PARALLEL + } // AMR Send region +#endif // MPI_PARALLEL // Construct a new MeshBlock list (moving the data within the MPI rank) - Kokkos::Profiling::pushRegion("AMR: Construct new MeshBlockList"); - RegionSize block_size = GetBlockSize(); - BlockList_t new_block_list(nbe - nbs + 1); - for (int n = nbs; n <= nbe; n++) { - int on = newtoold[n]; - if ((ranklist[on] == Globals::my_rank) && - (loclist[on].level() == newloc[n].level())) { - // on the same MPI rank and same level -> just move it - new_block_list[n - nbs] = FindMeshBlock(on); - if (!new_block_list[n - nbs]) { + { // AMR Construct new MeshBlockList region + PARTHENON_INSTRUMENT + RegionSize block_size = GetBlockSize(); + + for (int n = nbs; n <= nbe; n++) { + int on = newtoold[n]; + if ((ranklist[on] == Globals::my_rank) && + (loclist[on].level() == newloc[n].level())) { + // on the same MPI rank and same level -> just move it + new_block_list[n - nbs] = FindMeshBlock(on); + if (!new_block_list[n - nbs]) { + BoundaryFlag block_bcs[6]; + SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs); + new_block_list[n - nbs] = + MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, + app_in, packages, resolved_packages, gflag); + } + } else { + // on a different refinement level or MPI rank - create a new block BoundaryFlag block_bcs[6]; SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs); + // append new block to list of 
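The AMR send region above triages every old block against its image in the new block list: same level but a new rank means a straight SendSameToSame, a finer new level fans out one SendCoarseToFine per new leaf, and a coarser new level restricts and sends with SendFineToCoarse. The decision table in isolation, as a sketch:

#include <cassert>

enum class SendKind { None, SameToSame, CoarseToFine, FineToCoarse };

// Mirrors the branch structure of the "AMR Send region" above.
SendKind ClassifySend(int old_level, int new_level, int new_rank, int my_rank) {
  if (new_level == old_level) {
    // Same rank and level: the block is moved locally, nothing is sent.
    return (new_rank != my_rank) ? SendKind::SameToSame : SendKind::None;
  }
  return (new_level > old_level) ? SendKind::CoarseToFine  // one send per new leaf
                                 : SendKind::FineToCoarse; // restrict, then send
}

int main() {
  assert(ClassifySend(3, 3, 0, 0) == SendKind::None);
  assert(ClassifySend(3, 3, 1, 0) == SendKind::SameToSame);
  assert(ClassifySend(3, 4, 1, 0) == SendKind::CoarseToFine);
  assert(ClassifySend(4, 3, 0, 0) == SendKind::FineToCoarse);
  return 0;
}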
MeshBlocks new_block_list[n - nbs] = MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, app_in, packages, resolved_packages, gflag); } - } else { - // on a different refinement level or MPI rank - create a new block - BoundaryFlag block_bcs[6]; - SetBlockSizeAndBoundaries(newloc[n], block_size, block_bcs); - // append new block to list of MeshBlocks - new_block_list[n - nbs] = - MeshBlock::Make(n, n - nbs, newloc[n], block_size, block_bcs, this, pin, app_in, - packages, resolved_packages, gflag); } - } + } // AMR Construct new MeshBlockList region // Replace the MeshBlock list auto old_block_list = std::move(block_list); @@ -833,147 +830,146 @@ void Mesh::RedistributeAndRefineMeshBlocks(ParameterInput *pin, ApplicationInput block_list[n - nbs]->lid = n - nbs; } - Kokkos::Profiling::popRegion(); // AMR: Construct new MeshBlockList - // Receive the data and load into MeshBlocks - Kokkos::Profiling::pushRegion("AMR: Recv data and unpack"); - bool all_received; - int niter = 0; - if (block_list.size() > 0) { - // Create a vector for holding the status of all communications, it is sized to fit - // the maximal number of calculations that this rank could receive: the number of - // blocks on the rank x the number of variables x times the number of fine blocks - // that would communicate if every block had been coarsened (8 in 3D) - std::vector finished( - std::max((nbe - nbs + 1), 1) * FindMeshBlock(nbs)->vars_cc_.size() * 8, false); - do { - all_received = true; - niter++; - int idx = 0; - for (int n = nbs; n <= nbe; n++) { - int on = newtoold[n]; - LogicalLocation &oloc = loclist[on]; - LogicalLocation &nloc = newloc[n]; - auto pb = FindMeshBlock(n); - if (oloc.level() == nloc.level() && - ranklist[on] != Globals::my_rank) { // same level, different rank + { // AMR Recv and unpack data + PARTHENON_INSTRUMENT + bool all_received; + int niter = 0; + if (block_list.size() > 0) { + // Create a vector for holding the status of all communications, it is sized to fit + // the maximal number of calculations that this rank could receive: the number of + // blocks on the rank x the number of variables x times the number of fine blocks + // that would communicate if every block had been coarsened (8 in 3D) + std::vector finished( + std::max((nbe - nbs + 1), 1) * FindMeshBlock(nbs)->vars_cc_.size() * 8, false); + do { + all_received = true; + niter++; + int idx = 0; + for (int n = nbs; n <= nbe; n++) { + int on = newtoold[n]; + LogicalLocation &oloc = loclist[on]; + LogicalLocation &nloc = newloc[n]; + auto pb = FindMeshBlock(n); + if (oloc.level() == nloc.level() && + ranklist[on] != Globals::my_rank) { // same level, different rank #ifdef MPI_PARALLEL - for (auto &var : pb->vars_cc_) { - if (!finished[idx]) - finished[idx] = - TryRecvSameToSame(n - nbs, ranklist[on], var.get(), pb.get(), this); - all_received = finished[idx++] && all_received; - } + for (auto &var : pb->vars_cc_) { + if (!finished[idx]) + finished[idx] = + TryRecvSameToSame(n - nbs, ranklist[on], var.get(), pb.get(), this); + all_received = finished[idx++] && all_received; + } #endif - } else if (oloc.level() > nloc.level()) { // f2c - for (int l = 0; l < nleaf; l++) { - auto pob = pb; - if (ranklist[on + l] == Globals::my_rank) pob = old_block_list[on + l - onbs]; - LogicalLocation &oloc = loclist[on + l]; + } else if (oloc.level() > nloc.level()) { // f2c + for (int l = 0; l < nleaf; l++) { + auto pob = pb; + if (ranklist[on + l] == Globals::my_rank) + pob = old_block_list[on + l - onbs]; + LogicalLocation &oloc 
= loclist[on + l]; + for (auto &var : pb->vars_cc_) { + if (!finished[idx]) { + auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label()); + finished[idx] = + TryRecvFineToCoarse(n - nbs, ranklist[on + l], oloc, var_in.get(), + var.get(), pb.get(), this); + } + all_received = finished[idx++] && all_received; + } + } + } else if (oloc.level() < nloc.level()) { // c2f for (auto &var : pb->vars_cc_) { if (!finished[idx]) { + auto pob = pb; + if (ranklist[on] == Globals::my_rank) pob = old_block_list[on - onbs]; auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label()); - finished[idx] = - TryRecvFineToCoarse(n - nbs, ranklist[on + l], oloc, var_in.get(), - var.get(), pb.get(), this); + finished[idx] = TryRecvCoarseToFine( + n - nbs, ranklist[on], nloc, var_in.get(), var.get(), pb.get(), this); } all_received = finished[idx++] && all_received; } } - } else if (oloc.level() < nloc.level()) { // c2f - for (auto &var : pb->vars_cc_) { - if (!finished[idx]) { - auto pob = pb; - if (ranklist[on] == Globals::my_rank) pob = old_block_list[on - onbs]; - auto var_in = pob->meshblock_data.Get()->GetVarPtr(var->label()); - finished[idx] = TryRecvCoarseToFine( - n - nbs, ranklist[on], nloc, var_in.get(), var.get(), pb.get(), this); - } - all_received = finished[idx++] && all_received; - } } - } - // rb_idx is a running index, so we repeat the loop until all vals are true - } while (!all_received && niter < 1e7); - if (!all_received) PARTHENON_FAIL("AMR Receive failed"); - } - // Fence here to be careful that all communication is finished before moving - // on to prolongation - Kokkos::fence(); - - // Prolongate blocks that had a coarse buffer filled (i.e. c2f blocks) - ProResCache_t prolongation_cache; - int nprolong = 0; - for (int nn = nbs; nn <= nbe; nn++) { - int on = newtoold[nn]; - auto pmb = FindMeshBlock(nn); - if (newloc[nn].level() > loclist[on].level()) nprolong += pmb->vars_cc_.size(); - } - prolongation_cache.Initialize(nprolong, resolved_packages.get()); - int iprolong = 0; - for (int nn = nbs; nn <= nbe; nn++) { - int on = newtoold[nn]; - if (newloc[nn].level() > loclist[on].level()) { + // rb_idx is a running index, so we repeat the loop until all vals are true + } while (!all_received && niter < 1e7); + if (!all_received) PARTHENON_FAIL("AMR Receive failed"); + } + // Fence here to be careful that all communication is finished before moving + // on to prolongation + Kokkos::fence(); + + // Prolongate blocks that had a coarse buffer filled (i.e. 
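The receive loop above keeps one completion flag per expected message (sized generously at blocks x variables x 8, the worst case where every block was coarsened in 3D) and sweeps the probes until all report done, giving up after 1e7 iterations rather than hanging. The pattern, reduced to its skeleton:

#include <cstddef>
#include <functional>
#include <vector>

// Poll a set of non-blocking probes until every one has succeeded, with an
// iteration cap so a lost message fails loudly instead of spinning forever.
bool PollUntilComplete(std::vector<std::function<bool()>> &probes, long max_iters) {
  std::vector<bool> finished(probes.size(), false);
  for (long iter = 0; iter < max_iters; ++iter) {
    bool all_received = true;
    for (std::size_t i = 0; i < probes.size(); ++i) {
      if (!finished[i]) finished[i] = probes[i]();
      all_received = all_received && finished[i];
    }
    if (all_received) return true;
  }
  return false; // caller is expected to PARTHENON_FAIL, as above
}

int main() {
  int calls = 0;
  std::vector<std::function<bool()>> probes{[&] { return ++calls >= 3; }};
  return PollUntilComplete(probes, 100) ? 0 : 1; // succeeds on the third sweep
}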
c2f blocks) + ProResCache_t prolongation_cache; + int nprolong = 0; + for (int nn = nbs; nn <= nbe; nn++) { + int on = newtoold[nn]; auto pmb = FindMeshBlock(nn); - for (auto &var : pmb->vars_cc_) { - prolongation_cache.RegisterRegionHost( - iprolong++, - ProResInfo::GetInteriorProlongate(pmb.get(), NeighborBlock(), var), var.get(), - resolved_packages.get()); + if (newloc[nn].level() > loclist[on].level()) nprolong += pmb->vars_cc_.size(); + } + prolongation_cache.Initialize(nprolong, resolved_packages.get()); + int iprolong = 0; + for (int nn = nbs; nn <= nbe; nn++) { + int on = newtoold[nn]; + if (newloc[nn].level() > loclist[on].level()) { + auto pmb = FindMeshBlock(nn); + for (auto &var : pmb->vars_cc_) { + prolongation_cache.RegisterRegionHost( + iprolong++, + ProResInfo::GetInteriorProlongate(pmb.get(), NeighborBlock(), var), + var.get(), resolved_packages.get()); + } } + prolongation_cache.CopyToDevice(); } - } - prolongation_cache.CopyToDevice(); - - refinement::ProlongateShared(resolved_packages.get(), prolongation_cache, - block_list[0]->cellbounds, block_list[0]->c_cellbounds); - - // update the lists - loclist = std::move(newloc); - ranklist = std::move(newrank); - costlist = std::move(newcost); - PopulateLeafLocationMap(); - - // A block newly refined and prolongated may have neighbors which were - // already refined to the new level. - // If so, the prolongated versions of shared elements will not reflect - // the true, finer versions present in the neighbor block. - // We must create any new fine buffers and fill them from these neighbors - // in order to maintain a consistent global state. - // Thus we rebuild and synchronize the mesh now, but using a unique - // neighbor precedence favoring the "old" fine blocks over "new" ones - for (auto &pmb : block_list) { - pmb->pbval->SearchAndSetNeighbors(this, tree, ranklist.data(), nslist.data(), - newly_refined); - } - // Make sure all old sends/receives are done before we reconfigure the mesh -#ifdef MPI_PARALLEL - if (send_reqs.size() != 0) - PARTHENON_MPI_CHECK( - MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE)); -#endif - // Re-initialize the mesh with our temporary ownership/neighbor configurations. - // No buffers are different when we switch to the final precedence order. - SetSameLevelNeighbors(block_list, leaf_grid_locs, this->GetRootGridInfo(), nbs, false, - 0, newly_refined); - BuildGMGHierarchy(nbs, pin, app_in); - Initialize(false, pin, app_in); - - // Internal refinement relies on the fine shared values, which are only consistent after - // being updated with any previously fine versions - refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache, + refinement::ProlongateShared(resolved_packages.get(), prolongation_cache, block_list[0]->cellbounds, block_list[0]->c_cellbounds); - // Rebuild just the ownership model, this time weighting the "new" fine blocks just like - // any other blocks at their level. - SetSameLevelNeighbors(block_list, leaf_grid_locs, this->GetRootGridInfo(), nbs, false); - for (auto &pmb : block_list) { - pmb->pbval->SearchAndSetNeighbors(this, tree, ranklist.data(), nslist.data()); - } - - Kokkos::Profiling::popRegion(); // AMR: Recv data and unpack + // update the lists + loclist = std::move(newloc); + ranklist = std::move(newrank); + costlist = std::move(newcost); + PopulateLeafLocationMap(); + + // A block newly refined and prolongated may have neighbors which were + // already refined to the new level. 
+ // If so, the prolongated versions of shared elements will not reflect + // the true, finer versions present in the neighbor block. + // We must create any new fine buffers and fill them from these neighbors + // in order to maintain a consistent global state. + // Thus we rebuild and synchronize the mesh now, but using a unique + // neighbor precedence favoring the "old" fine blocks over "new" ones + for (auto &pmb : block_list) { + pmb->pbval->SearchAndSetNeighbors(this, tree, ranklist.data(), nslist.data(), + newly_refined); + } + // Make sure all old sends/receives are done before we reconfigure the mesh +#ifdef MPI_PARALLEL + if (send_reqs.size() != 0) + PARTHENON_MPI_CHECK( + MPI_Waitall(send_reqs.size(), send_reqs.data(), MPI_STATUSES_IGNORE)); +#endif + // Re-initialize the mesh with our temporary ownership/neighbor configurations. + // No buffers are different when we switch to the final precedence order. + SetSameLevelNeighbors(block_list, leaf_grid_locs, this->GetRootGridInfo(), nbs, false, + 0, newly_refined); + BuildGMGHierarchy(nbs, pin, app_in); + Initialize(false, pin, app_in); + + // Internal refinement relies on the fine shared values, which are only consistent + // after being updated with any previously fine versions + refinement::ProlongateInternal(resolved_packages.get(), prolongation_cache, + block_list[0]->cellbounds, + block_list[0]->c_cellbounds); + + // Rebuild just the ownership model, this time weighting the "new" fine blocks just + // like any other blocks at their level. + SetSameLevelNeighbors(block_list, leaf_grid_locs, this->GetRootGridInfo(), nbs, + false); + for (auto &pmb : block_list) { + pmb->pbval->SearchAndSetNeighbors(this, tree, ranklist.data(), nslist.data()); + } + } // AMR Recv and unpack data ResetLoadBalanceVariables(); - Kokkos::Profiling::popRegion(); // RedistributeAndRefineMeshBlocks } } // namespace parthenon diff --git a/src/mesh/domain.hpp b/src/mesh/domain.hpp index 39fe63aea704..49ec1a37af67 100644 --- a/src/mesh/domain.hpp +++ b/src/mesh/domain.hpp @@ -3,7 +3,7 @@ // Copyright(C) 2014 James M. Stone and other code contributors // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -23,6 +23,7 @@ #include #include +#include "basic_types.hpp" #include "defs.hpp" namespace parthenon { diff --git a/src/mesh/mesh-gmg.cpp b/src/mesh/mesh-gmg.cpp index 8b025776ff39..429cac855543 100644 --- a/src/mesh/mesh-gmg.cpp +++ b/src/mesh/mesh-gmg.cpp @@ -83,6 +83,30 @@ void Mesh::SetSameLevelNeighbors( for (auto ox2 : offsets[1]) { for (auto ox3 : offsets[2]) { NeighborConnect nc; + if (pos_neighbor_location.level() != loc.level()) { + // Check that the two blocks are in fact neighbors in this offset + // direction, since we only checked that they are actually neighbors + // when they have both been derefined to the coarser of their levels + // (this should only play a role in small meshes with periodic + // bounday conditions) + auto &fine_loc = pos_neighbor_location.level() > loc.level() + ? 
pos_neighbor_location + : loc; + int mult = loc.level() - pos_neighbor_location.level(); + std::array ox{ox1, ox2, ox3}; + bool not_neighbor = false; + for (int dir = 0; dir < 3; ++dir) { + if (ox[dir] != 0) { + // temp should be +1 if a block is to the right within its parent + // block and -1 if it is to the left. + const int temp = + 2 * (fine_loc.l(dir) - 2 * (fine_loc.l(dir) >> 1)) - 1; + PARTHENON_DEBUG_REQUIRE(temp * temp == 1, "Bad Offset"); + if (temp != mult * ox[dir]) not_neighbor = true; + } + } + if (not_neighbor) continue; + } int connect_indicator = std::abs(ox1) + std::abs(ox2) + std::abs(ox3); if (connect_indicator == 0) continue; if (connect_indicator == 1) { diff --git a/src/mesh/mesh.cpp b/src/mesh/mesh.cpp index 77dc244644c4..cee6af316fc8 100644 --- a/src/mesh/mesh.cpp +++ b/src/mesh/mesh.cpp @@ -3,7 +3,7 @@ // Copyright(C) 2014 James M. Stone and other code contributors // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -156,12 +156,18 @@ Mesh::Mesh(ParameterInput *pin, ApplicationInput *app_in, Packages_t &packages, if (app_in->MeshProblemGenerator != nullptr) { ProblemGenerator = app_in->MeshProblemGenerator; } + if (app_in->MeshPostInitialization != nullptr) { + PostInitialization = app_in->MeshPostInitialization; + } if (app_in->PreStepMeshUserWorkInLoop != nullptr) { PreStepUserWorkInLoop = app_in->PreStepMeshUserWorkInLoop; } if (app_in->PostStepMeshUserWorkInLoop != nullptr) { PostStepUserWorkInLoop = app_in->PostStepMeshUserWorkInLoop; } + if (app_in->UserMeshWorkBeforeOutput != nullptr) { + UserMeshWorkBeforeOutput = app_in->UserMeshWorkBeforeOutput; + } if (app_in->PreStepDiagnosticsInLoop != nullptr) { PreStepUserDiagnosticsInLoop = app_in->PreStepDiagnosticsInLoop; } @@ -900,7 +906,12 @@ void Mesh::EnrollBndryFncts_(ApplicationInput *app_in) { // \!fn void Mesh::ApplyUserWorkBeforeOutput(ParameterInput *pin) // \brief Apply MeshBlock::UserWorkBeforeOutput -void Mesh::ApplyUserWorkBeforeOutput(ParameterInput *pin) { +void Mesh::ApplyUserWorkBeforeOutput(Mesh *mesh, ParameterInput *pin, + SimTime const &time) { + // call Mesh version + mesh->UserMeshWorkBeforeOutput(mesh, pin, time); + + // call MeshBlock version for (auto &pmb : block_list) { pmb->UserWorkBeforeOutput(pmb.get(), pin); } @@ -911,7 +922,7 @@ void Mesh::ApplyUserWorkBeforeOutput(ParameterInput *pin) { // \brief initialization before the main loop as well as during remeshing void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput *app_in) { - Kokkos::Profiling::pushRegion("Mesh::Initialize"); + PARTHENON_INSTRUMENT bool init_done = true; const int nb_initial = nbtotal; do { @@ -930,6 +941,10 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput * PARTHENON_REQUIRE_THROWS( !(ProblemGenerator != nullptr && block_list[0]->ProblemGenerator != nullptr), "Mesh and MeshBlock ProblemGenerators are defined. Please use only one."); + PARTHENON_REQUIRE_THROWS( + !(PostInitialization != nullptr && + block_list[0]->PostInitialization != nullptr), + "Mesh and MeshBlock PostInitializations are defined. 
Please use only one."); // Call Mesh ProblemGenerator if (ProblemGenerator != nullptr) { @@ -946,6 +961,23 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput * pmb->ProblemGenerator(pmb.get(), pin); } } + + // Call Mesh PostInitialization + if (PostInitialization != nullptr) { + PARTHENON_REQUIRE(num_partitions == 1, + "Mesh PostInitialization requires parthenon/mesh/pack_size=-1 " + "during first initialization."); + + auto &md = mesh_data.GetOrAdd("base", 0); + PostInitialization(this, pin, md.get()); + // Call individual MeshBlock PostInitialization + } else { + for (int i = 0; i < nmb; ++i) { + auto &pmb = block_list[i]; + pmb->PostInitialization(pmb.get(), pin); + } + } + std::for_each(block_list.begin(), block_list.end(), [](auto &sp_block) { sp_block->SetAllVariablesToInitialized(); }); } @@ -1112,8 +1144,6 @@ void Mesh::Initialize(bool init_problem, ParameterInput *pin, ApplicationInput * // Initialize the "base" MeshData object mesh_data.Get()->Set(block_list, this); - - Kokkos::Profiling::popRegion(); // Mesh::Initialize } /// Finds location of a block with ID `tgid`. @@ -1163,7 +1193,6 @@ bool Mesh::SetBlockSizeAndBoundaries(LogicalLocation loc, RegionSize &block_size RegionSize Mesh::GetBlockSize(const LogicalLocation &loc) const { RegionSize block_size = GetBlockSize(); - bool valid_region = true; for (auto &dir : {X1DIR, X2DIR, X3DIR}) { block_size.xrat(dir) = mesh_size.xrat(dir); block_size.symmetry(dir) = mesh_size.symmetry(dir); @@ -1182,8 +1211,6 @@ RegionSize Mesh::GetBlockSize(const LogicalLocation &loc) const { PARTHENON_REQUIRE(loc.level() < root_level, "Something is messed up."); std::int64_t loc_low = loc.l(dir - 1) << (root_level - loc.level()); std::int64_t loc_hi = (loc.l(dir - 1) + 1) << (root_level - loc.level()); - if (block_size.nx(dir) * (nrbx[dir - 1] - loc_low) % (loc_hi - loc_low) != 0) - valid_region = false; block_size.nx(dir) = block_size.nx(dir) * (nrbx[dir - 1] - loc_low) / (loc_hi - loc_low); block_size.xmax(dir) = mesh_size.xmax(dir); @@ -1207,6 +1234,22 @@ int Mesh::GetNumberOfMeshBlockCells() const { } const RegionSize &Mesh::GetBlockSize() const { return base_block_size; } +const IndexShape &Mesh::GetLeafBlockCellBounds(CellLevel level) const { + // TODO(JMM): Luke this is for your Metadata::fine stuff. + PARTHENON_DEBUG_REQUIRE(level != CellLevel::fine, + "Currently no access to finer cellbounds"); + MeshBlock *pmb = block_list[0].get(); + if (level == CellLevel::same) { + return pmb->cellbounds; + // TODO(JMM): + // } else if (level == CellLevel::fine) { + // return pmb->fine_cellbounds; + // } + } else { // if (level == CellLevel::coarse) { + return pmb->c_cellbounds; + } +} + // Functionality re-used in mesh constructor void Mesh::RegisterLoadBalancing_(ParameterInput *pin) { #ifdef MPI_PARALLEL // JMM: Not sure this ifdef is needed diff --git a/src/mesh/mesh.hpp b/src/mesh/mesh.hpp index bf21853d3a82..6963b30b046f 100644 --- a/src/mesh/mesh.hpp +++ b/src/mesh/mesh.hpp @@ -3,7 +3,7 @@ // Copyright(C) 2014 James M. Stone and other code contributors // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -98,6 +98,7 @@ class Mesh { int GetNumberOfMeshBlockCells() const; const RegionSize &GetBlockSize() const; RegionSize GetBlockSize(const LogicalLocation &loc) const; + const IndexShape &GetLeafBlockCellBounds(CellLevel level = CellLevel::same) const; // data bool modified; @@ -150,7 +151,7 @@ class Mesh { std::shared_ptr FindMeshBlock(int tgid) const; - void ApplyUserWorkBeforeOutput(ParameterInput *pin); + void ApplyUserWorkBeforeOutput(Mesh *mesh, ParameterInput *pin, SimTime const &time); // Boundary Functions BValFunc MeshBndryFnctn[BOUNDARY_NFACES]; @@ -160,6 +161,8 @@ class Mesh { // defined in either the prob file or default_pgen.cpp in ../pgen/ std::function *)> ProblemGenerator = nullptr; + std::function *)> PostInitialization = + nullptr; static void UserWorkAfterLoopDefault(Mesh *mesh, ParameterInput *pin, SimTime &tm); // called in main loop std::function UserWorkAfterLoop = @@ -172,6 +175,10 @@ class Mesh { std::function PostStepUserWorkInLoop = &UserWorkInLoopDefault; + static void UserMeshWorkBeforeOutputDefault(Mesh *, ParameterInput *, SimTime const &); + std::function + UserMeshWorkBeforeOutput = &UserMeshWorkBeforeOutputDefault; + static void PreStepUserDiagnosticsInLoopDefault(Mesh *, ParameterInput *, SimTime const &); std::function @@ -194,6 +201,20 @@ class Mesh { std::vector GetNbList() const noexcept { return nblist; } std::vector GetLocList() const noexcept { return loclist; } + // TODO(JMM): Put in implementation file? + auto GetLevelsAndLogicalLocationsFlat() const noexcept { + std::vector levels, logicalLocations; + levels.reserve(nbtotal); + logicalLocations.reserve(nbtotal * 3); + for (const auto &loc : loclist) { + levels.push_back(loc.level() - GetRootLevel()); + logicalLocations.push_back(loc.lx1()); + logicalLocations.push_back(loc.lx2()); + logicalLocations.push_back(loc.lx3()); + } + return std::make_pair(levels, logicalLocations); + } + void OutputMeshStructure(const int dim, const bool dump_mesh_structure = true); // Ordering here is important to prevent deallocation of pools before boundary diff --git a/src/mesh/meshblock.cpp b/src/mesh/meshblock.cpp index e2d7af7c6ac0..409f29b1cd57 100644 --- a/src/mesh/meshblock.cpp +++ b/src/mesh/meshblock.cpp @@ -3,7 +3,7 @@ // Copyright(C) 2014 James M. Stone and other code contributors // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -115,6 +115,12 @@ void MeshBlock::Initialize(int igid, int ilid, LogicalLocation iloc, } else if (app_in->MeshProblemGenerator == nullptr) { ProblemGenerator = &ProblemGeneratorDefault; } + if (app_in->PostInitialization != nullptr) { + PostInitialization = app_in->PostInitialization; + // Only set default post-init when no mesh post-init is set + } else if (app_in->MeshPostInitialization == nullptr) { + PostInitialization = &PostInitializationDefault; + } if (app_in->MeshBlockUserWorkBeforeOutput != nullptr) { UserWorkBeforeOutput = app_in->MeshBlockUserWorkBeforeOutput; } diff --git a/src/mesh/meshblock.hpp b/src/mesh/meshblock.hpp index 3df86d7f73e7..a7b16b9955ac 100644 --- a/src/mesh/meshblock.hpp +++ b/src/mesh/meshblock.hpp @@ -3,7 +3,7 @@ // Copyright(C) 2014 James M. Stone and other code contributors // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -440,7 +440,9 @@ class MeshBlock : public std::enable_shared_from_this { // defined in either the prob file or default_pgen.cpp in ../pgen/ static void ProblemGeneratorDefault(MeshBlock *pmb, ParameterInput *pin); + static void PostInitializationDefault(MeshBlock *pmb, ParameterInput *pin); std::function ProblemGenerator = nullptr; + std::function PostInitialization = nullptr; static pMeshBlockApplicationData_t InitApplicationMeshBlockDataDefault(MeshBlock *, ParameterInput *pin); std::function diff --git a/src/outputs/ascent.cpp b/src/outputs/ascent.cpp index 57db543f0900..13af3b50f613 100644 --- a/src/outputs/ascent.cpp +++ b/src/outputs/ascent.cpp @@ -150,7 +150,7 @@ void AscentOutput::WriteOutputFile(Mesh *pm, ParameterInput *pin, SimTime *tm, const int njni = nj * ni; auto &ghost_mask = ghost_mask_; // redef to lambda capture class member pmb->par_for( - "Set ascent ghost mask", 0, ncells - 1, KOKKOS_LAMBDA(const int &idx) { + PARTHENON_AUTO_LABEL, 0, ncells - 1, KOKKOS_LAMBDA(const int &idx) { const int k = idx / (njni); const int j = (idx - k * njni) / ni; const int i = idx - k * njni - j * nj; diff --git a/src/outputs/output_utils.cpp b/src/outputs/output_utils.cpp index c6d05451ea29..64c099d85617 100644 --- a/src/outputs/output_utils.cpp +++ b/src/outputs/output_utils.cpp @@ -3,7 +3,7 @@ // Copyright(C) 2023 The Parthenon collaboration // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los
 // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -100,6 +100,71 @@ AllSwarmInfo::AllSwarmInfo(BlockList_t &block_list,
   }
 }

+// Tools that can be shared across Output types
+
+std::vector<Real> ComputeXminBlocks(Mesh *pm) {
+  return FlattenBlockInfo<Real>(pm, pm->ndim,
+                                [=](MeshBlock *pmb, std::vector<Real> &data, int &i) {
+                                  auto xmin = pmb->coords.GetXmin();
+                                  data[i++] = xmin[0];
+                                  if (pm->ndim > 1) {
+                                    data[i++] = xmin[1];
+                                  }
+                                  if (pm->ndim > 2) {
+                                    data[i++] = xmin[2];
+                                  }
+                                });
+}
+
+std::vector<int64_t> ComputeLocs(Mesh *pm) {
+  return FlattenBlockInfo<int64_t>(
+      pm, 3, [=](MeshBlock *pmb, std::vector<int64_t> &locs, int &i) {
+        locs[i++] = pmb->loc.lx1();
+        locs[i++] = pmb->loc.lx2();
+        locs[i++] = pmb->loc.lx3();
+      });
+}
+
+std::vector<int> ComputeIDsAndFlags(Mesh *pm) {
+  return FlattenBlockInfo<int>(pm, 5,
+                               [=](MeshBlock *pmb, std::vector<int> &data, int &i) {
+                                 data[i++] = pmb->loc.level();
+                                 data[i++] = pmb->gid;
+                                 data[i++] = pmb->lid;
+                                 data[i++] = pmb->cnghost;
+                                 data[i++] = pmb->gflag;
+                               });
+}
+
+// TODO(JMM): I could make this use the other loop
+// functionality/high-order functions, but it was more code than this
+// for, I think, little benefit.
+void ComputeCoords(Mesh *pm, bool face, const IndexRange &ib, const IndexRange &jb,
+                   const IndexRange &kb, std::vector<Real> &x, std::vector<Real> &y,
+                   std::vector<Real> &z) {
+  const int nx1 = ib.e - ib.s + 1;
+  const int nx2 = jb.e - jb.s + 1;
+  const int nx3 = kb.e - kb.s + 1;
+  const int num_blocks = pm->block_list.size();
+  x.resize((nx1 + face) * num_blocks);
+  y.resize((nx2 + face) * num_blocks);
+  z.resize((nx3 + face) * num_blocks);
+  std::size_t idx_x = 0, idx_y = 0, idx_z = 0;
+
+  // note relies on casting of bool to int
+  for (auto &pmb : pm->block_list) {
+    for (int i = ib.s; i <= ib.e + face; ++i) {
+      x[idx_x++] = face ? pmb->coords.Xf<1>(i) : pmb->coords.Xc<1>(i);
+    }
+    for (int j = jb.s; j <= jb.e + face; ++j) {
+      y[idx_y++] = face ? pmb->coords.Xf<2>(j) : pmb->coords.Xc<2>(j);
+    }
+    for (int k = kb.s; k <= kb.e + face; ++k) {
+      z[idx_z++] = face ? pmb->coords.Xf<3>(k) : pmb->coords.Xc<3>(k);
+    }
+  }
+}
+
 // TODO(JMM): may need to generalize this
 std::size_t MPIPrefixSum(std::size_t local, std::size_t &tot_count) {
   std::size_t out = 0;

diff --git a/src/outputs/output_utils.hpp b/src/outputs/output_utils.hpp
index c42a235d6a85..db6353090cbf 100644
--- a/src/outputs/output_utils.hpp
+++ b/src/outputs/output_utils.hpp
@@ -3,7 +3,7 @@
 // Copyright(C) 2023 The Parthenon collaboration
 // Licensed under the 3-clause BSD License, see LICENSE file for details
 //========================================================================================
-// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S.
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -29,8 +29,10 @@ #include // Parthenon +#include "basic_types.hpp" #include "interface/metadata.hpp" #include "interface/variable.hpp" +#include "mesh/domain.hpp" #include "mesh/mesh.hpp" #include "mesh/meshblock.hpp" #include "utils/error_checking.hpp" @@ -53,6 +55,7 @@ struct VarInfo { bool is_sparse; bool is_vector; std::vector component_labels; + int Size() const { return nx6 * nx5 * nx4 * nx3 * nx2 * nx1; } VarInfo() = delete; @@ -200,6 +203,59 @@ struct AllSwarmInfo { bool is_restart); }; +template +std::vector FlattenBlockInfo(Mesh *pm, int shape, Function_t f) { + const int num_blocks_local = static_cast(pm->block_list.size()); + std::vector data(shape * num_blocks_local); + int i = 0; + for (auto &pmb : pm->block_list) { + f(pmb.get(), data, i); + } + return data; +} + +// mirror must be provided because copying done externally +template +void PackOrUnpackVar(MeshBlock *pmb, Variable *pvar, bool do_ghosts, idx_t &idx, + std::vector &data, Function_t f) { + const auto &Nt = pvar->GetDim(6); + const auto &Nu = pvar->GetDim(5); + const auto &Nv = pvar->GetDim(4); + const IndexDomain domain = (do_ghosts ? IndexDomain::entire : IndexDomain::interior); + IndexRange kb, jb, ib; + if (pvar->metadata().Where() == MetadataFlag(Metadata::Cell)) { + kb = pmb->cellbounds.GetBoundsK(domain); + jb = pmb->cellbounds.GetBoundsJ(domain); + ib = pmb->cellbounds.GetBoundsI(domain); + // TODO(JMM): Add topological elements here + } else { // metadata none + kb = {0, pvar->GetDim(3) - 1}; + jb = {0, pvar->GetDim(2) - 1}; + ib = {0, pvar->GetDim(1) - 1}; + } + for (int t = 0; t < Nt; ++t) { + for (int u = 0; u < Nu; ++u) { + for (int v = 0; v < Nv; ++v) { + for (int k = kb.s; k <= kb.e; ++k) { + for (int j = jb.s; j <= jb.e; ++j) { + for (int i = ib.s; i <= ib.e; ++i) { + f(idx, t, u, v, k, j, i); + idx++; + } + } + } + } + } + } +} + +void ComputeCoords(Mesh *pm, bool face, const IndexRange &ib, const IndexRange &jb, + const IndexRange &kb, std::vector &x, std::vector &y, + std::vector &z); +std::vector ComputeXminBlocks(Mesh *pm); +std::vector ComputeLocs(Mesh *pm); +std::vector ComputeIDsAndFlags(Mesh *pm); + // TODO(JMM): Potentially unsafe if MPI_UNSIGNED_LONG_LONG isn't a size_t // however I think it's probably safe to assume we'll be on systems // where this is the case? 
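As a concrete illustration of the FlattenBlockInfo pattern declared above, here is a minimal, self-contained sketch using stand-in types (Block and Flatten are hypothetical names; MeshBlock and the real helpers live in the headers above). The filler lambda advances a shared cursor, so each block appends a fixed-width record to one flat buffer that can then be handed to a single collective HDF5 write:

#include <cstdint>
#include <cstdio>
#include <vector>

struct Block { std::int64_t lx1, lx2, lx3; }; // stand-in for MeshBlock

template <typename T, typename F>
std::vector<T> Flatten(const std::vector<Block> &blocks, int width, F fill) {
  std::vector<T> data(width * blocks.size());
  int i = 0; // cursor shared across blocks, advanced by `fill`
  for (const auto &b : blocks) fill(b, data, i);
  return data;
}

int main() {
  const std::vector<Block> blocks = {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}};
  auto locs = Flatten<std::int64_t>(
      blocks, 3, [](const Block &b, std::vector<std::int64_t> &d, int &i) {
        d[i++] = b.lx1;
        d[i++] = b.lx2;
        d[i++] = b.lx3;
      });
  for (std::size_t n = 0; n < locs.size(); n += 3)
    std::printf("block %zu: (%lld, %lld, %lld)\n", n / 3, (long long)locs[n],
                (long long)locs[n + 1], (long long)locs[n + 2]);
  return 0;
}

PackOrUnpackVar follows the same cursor-driven shape: the caller's lambda decides which direction data flows through the shared index, so the one loop nest can serve both packing for output and, presumably, unpacking on read, depending on the lambda supplied.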
diff --git a/src/outputs/outputs.cpp b/src/outputs/outputs.cpp index 33b35a24f284..1c6c967c9c0c 100644 --- a/src/outputs/outputs.cpp +++ b/src/outputs/outputs.cpp @@ -1,6 +1,6 @@ //======================================================================================== // Parthenon performance portable AMR framework -// Copyright(C) 2020-2023 The Parthenon collaboration +// Copyright(C) 2020-2024 The Parthenon collaboration // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== // Athena++ astrophysical MHD code @@ -263,6 +263,7 @@ Outputs::Outputs(Mesh *pm, ParameterInput *pin, SimTime *tm) { num_rst_outputs++; } #ifdef ENABLE_HDF5 + op.write_xdmf = pin->GetOrAddBoolean(op.block_name, "write_xdmf", true); pnew_type = new PHDF5Output(op, restart); #else msg << "### FATAL ERROR in Outputs constructor" << std::endl @@ -413,7 +414,7 @@ void OutputType::ClearOutputData() { void Outputs::MakeOutputs(Mesh *pm, ParameterInput *pin, SimTime *tm, const SignalHandler::OutputSignal signal) { - Kokkos::Profiling::pushRegion("MakeOutputs"); + PARTHENON_INSTRUMENT bool first = true; OutputType *ptype = pfirst_type_; while (ptype != nullptr) { @@ -422,14 +423,13 @@ void Outputs::MakeOutputs(Mesh *pm, ParameterInput *pin, SimTime *tm, ((tm->ncycle == 0) || (tm->time >= ptype->output_params.next_time) || (tm->time >= tm->tlim) || (signal != SignalHandler::OutputSignal::none)))) { if (first && ptype->output_params.file_type != "hst") { - pm->ApplyUserWorkBeforeOutput(pin); + pm->ApplyUserWorkBeforeOutput(pm, pin, *tm); first = false; } ptype->WriteOutputFile(pm, pin, tm, signal); } ptype = ptype->pnext_type; // move to next OutputType node in singly linked list } - Kokkos::Profiling::popRegion(); // MakeOutputs } } // namespace parthenon diff --git a/src/outputs/outputs.hpp b/src/outputs/outputs.hpp index 4fd64236a19c..ff268926d7e1 100644 --- a/src/outputs/outputs.hpp +++ b/src/outputs/outputs.hpp @@ -3,7 +3,7 @@ // Copyright(C) 2014 James M. Stone and other code contributors // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -64,12 +64,13 @@ struct OutputParameters { bool single_precision_output; bool sparse_seed_nans; int hdf5_compression_level; + bool write_xdmf; // TODO(felker): some of the parameters in this class are not initialized in constructor OutputParameters() : block_number(0), next_time(0.0), dt(-1.0), file_number(0), include_ghost_zones(false), cartesian_vector(false), single_precision_output(false), sparse_seed_nans(false), - hdf5_compression_level(5) {} + hdf5_compression_level(5), write_xdmf(false) {} }; //---------------------------------------------------------------------------------------- @@ -208,14 +209,18 @@ class PHDF5Output : public OutputType { private: std::string GenerateFilename_(ParameterInput *pin, SimTime *tm, const SignalHandler::OutputSignal signal); + void WriteBlocksMetadata_(Mesh *pm, hid_t file, const HDF5::H5P &pl, hsize_t offset, + hsize_t max_blocks_global) const; + void WriteCoordinates_(Mesh *pm, const IndexDomain &domain, hid_t file, + const HDF5::H5P &pl, hsize_t offset, + hsize_t max_blocks_global) const; + void WriteLevelsAndLocs_(Mesh *pm, hid_t file, const HDF5::H5P &pl, hsize_t offset, + hsize_t max_blocks_global) const; + void WriteSparseInfo_(Mesh *pm, hbool_t *sparse_allocated, + const std::vector &sparse_names, hsize_t num_sparse, + hid_t file, const HDF5::H5P &pl, size_t offset, + hsize_t max_blocks_global) const; const bool restart_; // true if we write a restart file, false for regular output files - // TODO(JMM): these methods might want to live in the base class or in output_utils.hpp - void ComputeXminBlocks_(Mesh *pm, std::vector &data); - void ComputeLocs_(Mesh *pm, std::vector &locs); - void ComputeIDsAndFlags_(Mesh *pm, std::vector &data); - void ComputeCoords_(Mesh *pm, bool face, const IndexRange &ib, const IndexRange &jb, - const IndexRange &kb, std::vector &x, std::vector &y, - std::vector &z); }; //---------------------------------------------------------------------------------------- diff --git a/src/outputs/parthenon_hdf5.cpp b/src/outputs/parthenon_hdf5.cpp index 004054d8421a..d7677dfee4fd 100644 --- a/src/outputs/parthenon_hdf5.cpp +++ b/src/outputs/parthenon_hdf5.cpp @@ -3,7 +3,7 @@ // Copyright(C) 2020-2023 The Parthenon collaboration // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -217,19 +217,6 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm my_offset += nblist[i]; } - const std::array local_offset({my_offset, 0, 0, 0, 0, 0, 0}); - - // these can vary by data set, except index 0 is always the same - std::array local_count( - {static_cast(num_blocks_local), 1, 1, 1, 1, 1, 1}); - std::array global_count( - {static_cast(max_blocks_global), 1, 1, 1, 1, 1, 1}); - - // for convenience - const hsize_t *const p_loc_offset = local_offset.data(); - const hsize_t *const p_loc_cnt = local_count.data(); - const hsize_t *const p_glob_cnt = global_count.data(); - H5P const pl_xfer = H5P::FromHIDCheck(H5Pcreate(H5P_DATASET_XFER)); H5P const pl_dcreate = H5P::FromHIDCheck(H5Pcreate(H5P_DATASET_CREATE)); @@ -240,99 +227,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm PARTHENON_HDF5_CHECK(H5Pset_dxpl_mpio(pl_xfer, H5FD_MPIO_COLLECTIVE)); #endif - // write Blocks metadata - { - Kokkos::Profiling::pushRegion("write block metadata"); - const H5G gBlocks = MakeGroup(file, "/Blocks"); - - // write Xmin[ndim] for blocks - { - std::vector tmpData(num_blocks_local * 3); - ComputeXminBlocks_(pm, tmpData); - local_count[1] = global_count[1] = pm->ndim; - HDF5Write2D(gBlocks, "xmin", tmpData.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer); - } - - // write Block ID - { - // LOC.lx1,2,3 - hsize_t n = 3; - std::vector tmpLoc(num_blocks_local * n); - local_count[1] = global_count[1] = n; - ComputeLocs_(pm, tmpLoc); - HDF5Write2D(gBlocks, "loc.lx123", tmpLoc.data(), p_loc_offset, p_loc_cnt, - p_glob_cnt, pl_xfer); - - // (LOC.)level, GID, LID, cnghost, gflag - n = 5; // this is NOT H5_NDIM - std::vector tmpID(num_blocks_local * n); - local_count[1] = global_count[1] = n; - ComputeIDsAndFlags_(pm, tmpID); - HDF5Write2D(gBlocks, "loc.level-gid-lid-cnghost-gflag", tmpID.data(), p_loc_offset, - p_loc_cnt, p_glob_cnt, pl_xfer); - } - Kokkos::Profiling::popRegion(); // write block metadata - } // Block section - - // Write mesh coordinates to file - Kokkos::Profiling::pushRegion("write mesh coords"); - for (const bool face : {true, false}) { - const H5G gLocations = MakeGroup(file, face ? 
"/Locations" : "/VolumeLocations"); - - // write X coordinates - std::vector loc_x((nx1 + face) * num_blocks_local); - std::vector loc_y((nx2 + face) * num_blocks_local); - std::vector loc_z((nx3 + face) * num_blocks_local); - - ComputeCoords_(pm, face, out_ib, out_jb, out_kb, loc_x, loc_y, loc_z); - - local_count[1] = global_count[1] = nx1 + face; - HDF5Write2D(gLocations, "x", loc_x.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer); - - local_count[1] = global_count[1] = nx2 + face; - HDF5Write2D(gLocations, "y", loc_y.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer); - - local_count[1] = global_count[1] = nx3 + face; - HDF5Write2D(gLocations, "z", loc_z.data(), p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer); - } - Kokkos::Profiling::popRegion(); // write mesh coords - - // Write Levels and Logical Locations with the level for each Meshblock loclist contains - // levels and logical locations for all meshblocks on all ranks - { - Kokkos::Profiling::pushRegion("write levels and locations"); - const auto &loclist = pm->GetLocList(); - - std::vector levels; - levels.reserve(pm->nbtotal); - - std::vector logicalLocations; - logicalLocations.reserve(pm->nbtotal * 3); - - for (const auto &loc : loclist) { - levels.push_back(loc.level() - pm->GetRootLevel()); - logicalLocations.push_back(loc.lx1()); - logicalLocations.push_back(loc.lx2()); - logicalLocations.push_back(loc.lx3()); - } - - // Only write levels on rank 0 since it has data for all ranks - local_count[0] = (Globals::my_rank == 0) ? pm->nbtotal : 0; - HDF5WriteND(file, "Levels", levels.data(), 1, local_offset.data(), local_count.data(), - global_count.data(), pl_xfer, H5P_DEFAULT); - - local_count[1] = global_count[1] = 3; - HDF5Write2D(file, "LogicalLocations", logicalLocations.data(), local_offset.data(), - local_count.data(), global_count.data(), pl_xfer); - - // reset for collective output - local_count[0] = num_blocks_local; - Kokkos::Profiling::popRegion(); // write levels and locations - } + WriteBlocksMetadata_(pm, file, pl_xfer, my_offset, max_blocks_global); + WriteCoordinates_(pm, theDomain, file, pl_xfer, my_offset, max_blocks_global); + WriteLevelsAndLocs_(pm, file, pl_xfer, my_offset, max_blocks_global); // -------------------------------------------------------------------------------- // // WRITING VARIABLES DATA // @@ -397,21 +294,13 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // allocate space for largest size variable int varSize_max = 0; for (auto &vinfo : all_vars_info) { - const int varSize = - vinfo.nx6 * vinfo.nx5 * vinfo.nx4 * vinfo.nx3 * vinfo.nx2 * vinfo.nx1; + const int varSize = vinfo.Size(); varSize_max = std::max(varSize_max, varSize); } using OutT = typename std::conditional::type; std::vector tmpData(varSize_max * num_blocks_local); - // create persistent spaces - local_count[0] = num_blocks_local; - global_count[0] = max_blocks_global; - local_count[4] = global_count[4] = nx3; - local_count[5] = global_count[5] = nx2; - local_count[6] = global_count[6] = nx1; - // for each variable we write for (auto &vinfo : all_vars_info) { Kokkos::Profiling::pushRegion("write variable loop"); @@ -423,9 +312,21 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm const hsize_t nx5 = vinfo.nx5; const hsize_t nx4 = vinfo.nx4; - local_count[1] = global_count[1] = nx6; - local_count[2] = global_count[2] = nx5; - local_count[3] = global_count[3] = nx4; + hsize_t local_offset[H5_NDIM] = {my_offset, 0, 0, 0, 0, 0, 0}; + hsize_t 
local_count[H5_NDIM] = {static_cast(num_blocks_local), + static_cast(nx6), + static_cast(nx5), + static_cast(nx4), + static_cast(nx3), + static_cast(nx2), + static_cast(nx1)}; + hsize_t global_count[H5_NDIM] = {static_cast(max_blocks_global), + static_cast(nx6), + static_cast(nx5), + static_cast(nx4), + static_cast(nx3), + static_cast(nx2), + static_cast(nx1)}; std::vector alldims({nx6, nx5, nx4, static_cast(vinfo.nx3), static_cast(vinfo.nx2), @@ -496,29 +397,11 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // a similar block in parthenon_manager.cpp if (v->IsAllocated() && (var_name == v->label())) { auto v_h = v->data.GetHostMirrorAndCopy(); - for (int t = 0; t < nx6; ++t) { - for (int u = 0; u < nx5; ++u) { - for (int v = 0; v < nx4; ++v) { - if (vinfo.where == MetadataFlag(Metadata::Cell)) { - for (int k = out_kb.s; k <= out_kb.e; ++k) { - for (int j = out_jb.s; j <= out_jb.e; ++j) { - for (int i = out_ib.s; i <= out_ib.e; ++i) { - tmpData[index++] = static_cast(v_h(t, u, v, k, j, i)); - } - } - } - } else { - for (int k = 0; k < vinfo.nx3; ++k) { - for (int j = 0; j < vinfo.nx2; ++j) { - for (int i = 0; i < vinfo.nx1; ++i) { - tmpData[index++] = static_cast(v_h(t, u, v, k, j, i)); - } - } - } - } - } - } - } + OutputUtils::PackOrUnpackVar( + pmb.get(), v.get(), output_params.include_ghost_zones, index, tmpData, + [&](auto index, int t, int u, int v, int k, int j, int i) { + tmpData[index] = static_cast(v_h(t, u, v, k, j, i)); + }); is_allocated = true; break; @@ -555,8 +438,8 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm Kokkos::Profiling::pushRegion("write variable data"); // write data to file - HDF5WriteND(file, var_name, tmpData.data(), ndim, p_loc_offset, p_loc_cnt, p_glob_cnt, - pl_xfer, pl_dcreate); + HDF5WriteND(file, var_name, tmpData.data(), ndim, &local_offset[0], &local_count[0], + &global_count[0], pl_xfer, pl_dcreate); Kokkos::Profiling::popRegion(); // write variable data Kokkos::Profiling::popRegion(); // write variable loop } @@ -593,21 +476,9 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm // write SparseInfo and SparseFields (we can't write a zero-size dataset, so only write // this if we have sparse fields) if (num_sparse > 0) { - Kokkos::Profiling::pushRegion("write sparse info"); - local_count[1] = global_count[1] = num_sparse; - - HDF5Write2D(file, "SparseInfo", sparse_allocated.get(), p_loc_offset, p_loc_cnt, - p_glob_cnt, pl_xfer); - - // write names of sparse fields as attribute, first convert to vector of const char* - std::vector names(num_sparse); - for (size_t i = 0; i < num_sparse; ++i) - names[i] = sparse_names[i].c_str(); - - const H5D dset = H5D::FromHIDCheck(H5Dopen2(file, "SparseInfo", H5P_DEFAULT)); - HDF5WriteAttribute("SparseFields", names, dset); - Kokkos::Profiling::popRegion(); // write sparse info - } // SparseInfo and SparseFields sections + WriteSparseInfo_(pm, sparse_allocated.get(), sparse_names, num_sparse, file, pl_xfer, + my_offset, max_blocks_global); + } // SparseInfo and SparseFields sections // -------------------------------------------------------------------------------- // // WRITING PARTICLE DATA // @@ -676,13 +547,20 @@ void PHDF5Output::WriteOutputFileImpl(Mesh *pm, ParameterInput *pin, SimTime *tm } Kokkos::Profiling::popRegion(); // write particle data - Kokkos::Profiling::pushRegion("genXDMF"); - // generate XDMF companion file - XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info); - 
Kokkos::Profiling::popRegion(); // genXDMF
+  if (output_params.write_xdmf) {
+    Kokkos::Profiling::pushRegion("genXDMF");
+    // generate XDMF companion file
+    XDMF::genXDMF(filename, pm, tm, nx1, nx2, nx3, all_vars_info, swarm_info);
+    Kokkos::Profiling::popRegion(); // genXDMF
+  }

   Kokkos::Profiling::popRegion(); // WriteOutputFile???Prec
 }

+// explicit template instantiation
+template void PHDF5Output::WriteOutputFileImpl<false>(Mesh *, ParameterInput *, SimTime *,
+                                                      SignalHandler::OutputSignal);
+template void PHDF5Output::WriteOutputFileImpl<true>(Mesh *, ParameterInput *, SimTime *,
+                                                     SignalHandler::OutputSignal);

 std::string PHDF5Output::GenerateFilename_(ParameterInput *pin, SimTime *tm,
                                            const SignalHandler::OutputSignal signal) {
@@ -718,164 +596,131 @@ std::string PHDF5Output::GenerateFilename_(ParameterInput *pin, SimTime *tm,
   }
   return filename;
 }
-// TODO(JMM): Should this live in the base class or output_utils?
-void PHDF5Output::ComputeXminBlocks_(Mesh *pm, std::vector<Real> &data) {
-  int i = 0;
-  for (auto &pmb : pm->block_list) {
-    auto xmin = pmb->coords.GetXmin();
-    data[i++] = xmin[0];
-    if (pm->ndim > 1) {
-      data[i++] = xmin[1];
-    }
-    if (pm->ndim > 2) {
-      data[i++] = xmin[2];
-    }
-  }
-}
-// TODO(JMM): Should this live in the base class or output_utils?
-void PHDF5Output::ComputeLocs_(Mesh *pm, std::vector<int64_t> &locs) {
-  int i = 0;
-  for (auto &pmb : pm->block_list) {
-    locs[i++] = pmb->loc.lx1();
-    locs[i++] = pmb->loc.lx2();
-    locs[i++] = pmb->loc.lx3();
+
+void PHDF5Output::WriteBlocksMetadata_(Mesh *pm, hid_t file, const HDF5::H5P &pl,
+                                       hsize_t offset, hsize_t max_blocks_global) const {
+  using namespace HDF5;
+  Kokkos::Profiling::pushRegion("I/O HDF5: write block metadata");
+  const H5G gBlocks = MakeGroup(file, "/Blocks");
+  const hsize_t num_blocks_local = pm->block_list.size();
+  const hsize_t ndim = pm->ndim;
+  const hsize_t loc_offset[2] = {offset, 0};
+
+  // write Xmin[ndim] for blocks
+  {
+    // JMM: These arrays could be shared, but I think this is clearer
+    // as to what's going on.
+    hsize_t loc_cnt[2] = {num_blocks_local, ndim};
+    hsize_t glob_cnt[2] = {max_blocks_global, ndim};
+
+    std::vector<Real> tmpData = OutputUtils::ComputeXminBlocks(pm);
+    HDF5Write2D(gBlocks, "xmin", tmpData.data(), &loc_offset[0], &loc_cnt[0],
+                &glob_cnt[0], pl);
   }
-}
+
+  {
+    // LOC.lx1,2,3
+    hsize_t loc_cnt[2] = {num_blocks_local, 3};
+    hsize_t glob_cnt[2] = {max_blocks_global, 3};
+    std::vector<int64_t> tmpLoc = OutputUtils::ComputeLocs(pm);
+    HDF5Write2D(gBlocks, "loc.lx123", tmpLoc.data(), &loc_offset[0], &loc_cnt[0],
+                &glob_cnt[0], pl);
+  }
-// TODO(JMM): Should this live in the base class or output_utils?
-void PHDF5Output::ComputeIDsAndFlags_(Mesh *pm, std::vector<int> &data) {
-  int i = 0;
-  for (auto &pmb : pm->block_list) {
-    data[i++] = pmb->loc.level();
-    data[i++] = pmb->gid;
-    data[i++] = pmb->lid;
-    data[i++] = pmb->cnghost;
-    data[i++] = pmb->gflag;
-// TODO(JMM): Should this live in the base class or output_utils?
-void PHDF5Output::ComputeCoords_(Mesh *pm, bool face, const IndexRange &ib,
-                                 const IndexRange &jb, const IndexRange &kb,
-                                 std::vector<Real> &x, std::vector<Real> &y,
-                                 std::vector<Real> &z) {
-  std::size_t idx_x = 0, idx_y = 0, idx_z = 0;
-
-  // note relies on casting of bool to int
-  for (auto &pmb : pm->block_list) {
-    for (int i = ib.s; i <= ib.e + face; ++i) {
-      x[idx_x++] = face ? pmb->coords.Xf<1>(i) : pmb->coords.Xc<1>(i);
-    }
-    for (int j = jb.s; j <= jb.e + face; ++j) {
-      y[idx_y++] = face ?
pmb->coords.Xf<2>(j) : pmb->coords.Xc<2>(j); - } - for (int k = kb.s; k <= kb.e + face; ++k) { - z[idx_z++] = face ? pmb->coords.Xf<3>(k) : pmb->coords.Xc<3>(k); - } + + { + // (LOC.)level, GID, LID, cnghost, gflag + hsize_t loc_cnt[2] = {num_blocks_local, 5}; + hsize_t glob_cnt[2] = {max_blocks_global, 5}; + std::vector tmpID = OutputUtils::ComputeIDsAndFlags(pm); + HDF5Write2D(gBlocks, "loc.level-gid-lid-cnghost-gflag", tmpID.data(), &loc_offset[0], + &loc_cnt[0], &glob_cnt[0], pl); } + Kokkos::Profiling::popRegion(); // write block metadata } -// explicit template instantiation -template void PHDF5Output::WriteOutputFileImpl(Mesh *, ParameterInput *, SimTime *, - SignalHandler::OutputSignal); -template void PHDF5Output::WriteOutputFileImpl(Mesh *, ParameterInput *, SimTime *, - SignalHandler::OutputSignal); +void PHDF5Output::WriteCoordinates_(Mesh *pm, const IndexDomain &domain, hid_t file, + const HDF5::H5P &pl, hsize_t offset, + hsize_t max_blocks_global) const { + using namespace HDF5; + Kokkos::Profiling::pushRegion("write mesh coords"); + const IndexShape &shape = pm->GetLeafBlockCellBounds(); + const IndexRange ib = shape.GetBoundsI(domain); + const IndexRange jb = shape.GetBoundsJ(domain); + const IndexRange kb = shape.GetBoundsK(domain); -// Utility functions implemented -namespace HDF5 { -std::tuple, std::size_t> -HDF5GetAttributeInfo(hid_t location, const std::string &name, H5A &attr) { - // check if attribute exists - auto status = PARTHENON_HDF5_CHECK(H5Aexists(location, name.c_str())); - PARTHENON_REQUIRE_THROWS(status > 0, "Attribute '" + name + "' does not exist"); - - // Open attribute - attr = H5A::FromHIDCheck(H5Aopen(location, name.c_str(), H5P_DEFAULT)); - - // Get attribute shape - const H5S dataspace = H5S::FromHIDCheck(H5Aget_space(attr)); - int rank = PARTHENON_HDF5_CHECK(H5Sget_simple_extent_ndims(dataspace)); - std::size_t size = 1; - std::vector dim; - if (rank > 0) { - dim.resize(rank); - PARTHENON_HDF5_CHECK(H5Sget_simple_extent_dims(dataspace, dim.data(), NULL)); - for (int d = 0; d < rank; ++d) { - size *= dim[d]; - } - if (size == 0) { - PARTHENON_THROW("Attribute " + name + " has no value"); - } - } else { // scalar quantity - dim.resize(1); - dim[0] = 1; - } - // JMM: H5Handle doesn't play nice with returning a tuple/structured - // binding, which is why it's not in the tuple. I think the issue is - // that H5Handle doesn't have a copy assignment operator, only a - // move operator. That probably implies not great things about the - // performance of returning the dim array by value here, but - // whatever. This isn't performance critical code. - return std::make_tuple(rank, dim, size); -} + const hsize_t num_blocks_local = pm->block_list.size(); + const hsize_t loc_offset[2] = {offset, 0}; + hsize_t loc_cnt[2] = {num_blocks_local, 1}; + hsize_t glob_cnt[2] = {max_blocks_global, 1}; -// template specializations for std::string and bool -void HDF5WriteAttribute(const std::string &name, const std::string &value, - hid_t location) { - HDF5WriteAttribute(name, value.c_str(), location); -} + for (const bool face : {true, false}) { + const H5G gLocations = MakeGroup(file, face ? 
"/Locations" : "/VolumeLocations"); -template <> -void HDF5WriteAttribute(const std::string &name, const std::vector &values, - hid_t location) { - std::vector char_ptrs(values.size()); - for (size_t i = 0; i < values.size(); ++i) { - char_ptrs[i] = values[i].c_str(); + std::vector loc_x, loc_y, loc_z; + OutputUtils::ComputeCoords(pm, face, ib, jb, kb, loc_x, loc_y, loc_z); + + loc_cnt[1] = glob_cnt[1] = (ib.e - ib.s + 1) + face; + HDF5Write2D(gLocations, "x", loc_x.data(), &loc_offset[0], &loc_cnt[0], &glob_cnt[0], + pl); + + loc_cnt[1] = glob_cnt[1] = (jb.e - jb.s + 1) + face; + HDF5Write2D(gLocations, "y", loc_y.data(), &loc_offset[0], &loc_cnt[0], &glob_cnt[0], + pl); + + loc_cnt[1] = glob_cnt[1] = (kb.e - kb.s + 1) + face; + HDF5Write2D(gLocations, "z", loc_z.data(), &loc_offset[0], &loc_cnt[0], &glob_cnt[0], + pl); } - HDF5WriteAttribute(name, char_ptrs, location); + Kokkos::Profiling::popRegion(); // write mesh coords } -template <> -std::vector HDF5ReadAttributeVec(hid_t location, const std::string &name) { - // get strings as char pointers, HDF5 will allocate the memory and we need to free it - auto char_ptrs = HDF5ReadAttributeVec(location, name); +void PHDF5Output::WriteLevelsAndLocs_(Mesh *pm, hid_t file, const HDF5::H5P &pl, + hsize_t offset, hsize_t max_blocks_global) const { + using namespace HDF5; + Kokkos::Profiling::pushRegion("write levels and locations"); + auto [levels, logicalLocations] = pm->GetLevelsAndLogicalLocationsFlat(); + + // Only write levels on rank 0 since it has data for all ranks + const hsize_t num_blocks_local = pm->block_list.size(); + const hsize_t loc_offset[2] = {offset, 0}; + const hsize_t loc_cnt[2] = {(Globals::my_rank == 0) ? max_blocks_global : 0, 3}; + const hsize_t glob_cnt[2] = {max_blocks_global, 3}; - // make strings out of char pointers, which copies the memory and then free the memeory - std::vector res(char_ptrs.size()); - for (size_t i = 0; i < res.size(); ++i) { - res[i] = std::string(char_ptrs[i]); - free(char_ptrs[i]); - } + HDF5Write1D(file, "Levels", levels.data(), &loc_offset[0], &loc_cnt[0], &glob_cnt[0], + pl); + HDF5Write2D(file, "LogicalLocations", logicalLocations.data(), &loc_offset[0], + &loc_cnt[0], &glob_cnt[0], pl); - return res; + Kokkos::Profiling::popRegion(); // write levels and locations } -// JMM: A little circular but it works. 
-template <>
-std::vector<bool> HDF5ReadAttributeVec(hid_t location, const std::string &name) {
-  HostArray1D<bool> temp;
-  HDF5ReadAttribute(location, name, temp);
-  std::vector<bool> out(temp.size());
-  for (int i = 0; i < temp.size(); ++i) {
-    out[i] = temp[i];
-  }
-  return out;
-}
+void PHDF5Output::WriteSparseInfo_(Mesh *pm, hbool_t *sparse_allocated,
+                                   const std::vector<std::string> &sparse_names,
+                                   hsize_t num_sparse, hid_t file, const HDF5::H5P &pl,
+                                   size_t offset, hsize_t max_blocks_global) const {
+  using namespace HDF5;
+  Kokkos::Profiling::pushRegion("write sparse info");
-template <>
-void HDF5WriteAttribute(const std::string &name, const std::vector<bool> &values,
-                        hid_t location) {
-  // can't use std::vector here because std::vector<bool> doesn't have .data() member
-  std::unique_ptr<hbool_t[]> data(new hbool_t[values.size()]);
-  for (size_t i = 0; i < values.size(); ++i) {
-    data[i] = values[i];
-  }
-  HDF5WriteAttribute(name, values.size(), data.get(), location);
-}
+  const hsize_t num_blocks_local = pm->block_list.size();
+  const hsize_t loc_offset[2] = {offset, 0};
+  const hsize_t loc_cnt[2] = {num_blocks_local, num_sparse};
+  const hsize_t glob_cnt[2] = {max_blocks_global, num_sparse};
+
+  HDF5Write2D(file, "SparseInfo", sparse_allocated, &loc_offset[0], &loc_cnt[0],
+              &glob_cnt[0], pl);
+
+  // write names of sparse fields as attribute, first convert to vector of const char*
+  std::vector<const char *> names(num_sparse);
+  for (size_t i = 0; i < num_sparse; ++i)
+    names[i] = sparse_names[i].c_str();
-void HDF5ReadAttribute(hid_t location, const std::string &name, std::string &val) {
-  std::vector<std::string> vec = HDF5ReadAttributeVec<std::string>(location, name);
-  val = vec[0];
+  const H5D dset = H5D::FromHIDCheck(H5Dopen2(file, "SparseInfo", H5P_DEFAULT));
+  HDF5WriteAttribute("SparseFields", names, dset);
+  Kokkos::Profiling::popRegion(); // write sparse info
 }

+// Utility functions implemented
 namespace HDF5 {
 hid_t GenerateFileAccessProps() {
 #ifdef MPI_PARALLEL
   /* set the file access template for parallel IO access */

diff --git a/src/outputs/parthenon_hdf5.hpp b/src/outputs/parthenon_hdf5.hpp
index 4b0f9f46cd2c..d51c50025875 100644
--- a/src/outputs/parthenon_hdf5.hpp
+++ b/src/outputs/parthenon_hdf5.hpp
@@ -1,6 +1,6 @@
 //========================================================================================
 // Parthenon performance portable AMR framework
-// Copyright(C) 2020-2022 The Parthenon collaboration
+// Copyright(C) 2020-2024 The Parthenon collaboration
 // Licensed under the 3-clause BSD License, see LICENSE file for details
 //========================================================================================
 // (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
@@ -18,6 +18,31 @@
 #define OUTPUTS_PARTHENON_HDF5_HPP_

 #include "config.hpp"
+
+#include "kokkos_abstraction.hpp"
+#include "parthenon_arrays.hpp"
+
+// JMM: This could probably be done with template magic but I think
+// using a macro is honestly the simplest and cleanest solution here.
+// Template solution would be to define a variadic class to contain the
+// list of types and then a hierarchy of structs/functions to turn
+// that into function calls. Preprocessor seems easier, given we're
+// not manipulating this list in any way; a generic sketch of the
+// pattern follows below.
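As promised above, a small, self-contained illustration of the X-macro idiom the comment describes (FOR_EACH_TYPE and APPLY are hypothetical names, not the Parthenon macros): one type list drives all the explicit instantiations, so adding a type in a single place updates every consumer of the list.

#include <cstdio>
#include <string>

template <typename T>
void Describe(const T &) { std::puts("generic"); }

// The list of types lives in exactly one place...
#define FOR_EACH_TYPE \
  APPLY(int);         \
  APPLY(double);      \
  APPLY(std::string)

// ...and each consumer defines APPLY to stamp out what it needs,
// here explicit instantiations of the template above:
#define APPLY(...) template void Describe<__VA_ARGS__>(const __VA_ARGS__ &)
FOR_EACH_TYPE;
#undef APPLY

int main() {
  Describe(42);                 // uses the pre-instantiated Describe<int>
  Describe(3.14);               // Describe<double>
  Describe(std::string("hi"));  // Describe<std::string>
  return 0;
}

The extern-template declarations later in this header use the same list to promise the compiler that these instantiations exist in exactly one translation unit, which is what keeps the header cheap to include.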
+#define PARTHENON_ATTR_VALID_VEC_TYPES(T) \ + T, std::vector, ParArray1D, ParArray2D, ParArray3D, HostArray1D, \ + HostArray2D, HostArray3D, Kokkos::View, Kokkos::View, \ + ParArrayND, ParArrayHost +// JMM: This is the list of template specializations we +// "pre-instantiate" We only pre-instantiate device memory, not host +// memory. The reason is that when building with the Kokkos serial +// backend, DevMemSpace and HostMemSpace are the same and so this +// resolves to the same type in the macro, which causes problems. +#define PARTHENON_ATTR_FOREACH_VECTOR_TYPE(T) \ + PARTHENON_ATTR_APPLY(T); \ + PARTHENON_ATTR_APPLY(Kokkos::View); \ + PARTHENON_ATTR_APPLY(Kokkos::View); \ + PARTHENON_ATTR_APPLY(Kokkos::View); \ + PARTHENON_ATTR_APPLY(device_view_t) // Only proceed if HDF5 output enabled #ifdef ENABLE_HDF5 @@ -34,104 +59,13 @@ #include #include -#include "kokkos_abstraction.hpp" +#include "outputs/parthenon_hdf5_types.hpp" #include "utils/concepts_lite.hpp" #include "utils/error_checking.hpp" namespace parthenon { namespace HDF5 { -// Number of dimension of HDF5 field data sets (block x nv x nu x nt x nz x ny x nx) -static constexpr size_t H5_NDIM = 7; - -static constexpr int OUTPUT_VERSION_FORMAT = 3; - -/** - * @brief RAII handles for HDF5. Use the typedefs directly (e.g. `H5A`, `H5D`, etc.) - * - * @tparam CloseFn - function pointer to destructor for HDF5 object - */ -template -class H5Handle { - public: - H5Handle() = default; - - H5Handle(H5Handle const &) = delete; - H5Handle &operator=(H5Handle const &) = delete; - - H5Handle(H5Handle &&other) : hid_(other.Release()) {} - H5Handle &operator=(H5Handle &&other) { - Reset(); - hid_ = other.Release(); - return *this; - } - - static H5Handle FromHIDCheck(hid_t const hid) { - PARTHENON_REQUIRE_THROWS(hid >= 0, "H5 FromHIDCheck failed"); - - H5Handle handle; - handle.hid_ = hid; - return handle; - } - - void Reset() { - if (*this) { - PARTHENON_HDF5_CHECK(CloseFn(hid_)); - hid_ = -1; - } - } - - hid_t Release() { - auto hid = hid_; - hid_ = -1; - return hid; - } - - ~H5Handle() { Reset(); } - - // Implicit conversion to hid_t for convenience - operator hid_t() const { return hid_; } - explicit operator bool() const { return hid_ >= 0; } - - private: - hid_t hid_ = -1; -}; - -using H5A = H5Handle<&H5Aclose>; -using H5D = H5Handle<&H5Dclose>; -using H5F = H5Handle<&H5Fclose>; -using H5G = H5Handle<&H5Gclose>; -using H5O = H5Handle<&H5Oclose>; -using H5P = H5Handle<&H5Pclose>; -using H5T = H5Handle<&H5Tclose>; -using H5S = H5Handle<&H5Sclose>; - -// Static functions to return HDF type -static hid_t getHDF5Type(const hbool_t *) { return H5T_NATIVE_HBOOL; } -static hid_t getHDF5Type(const int32_t *) { return H5T_NATIVE_INT32; } -static hid_t getHDF5Type(const int64_t *) { return H5T_NATIVE_INT64; } -static hid_t getHDF5Type(const uint32_t *) { return H5T_NATIVE_UINT32; } -static hid_t getHDF5Type(const uint64_t *) { return H5T_NATIVE_UINT64; } -static hid_t getHDF5Type(const float *) { return H5T_NATIVE_FLOAT; } -static hid_t getHDF5Type(const double *) { return H5T_NATIVE_DOUBLE; } -static hid_t getHDF5Type(const char *) { return H5T_NATIVE_CHAR; } - -// On MacOS size_t is "unsigned long" and uint64_t is != "unsigned long". -// Thus, size_t is not captured by the overload above and needs to selectively enabled. 
-template ::value && // NOLINT - !std::is_same::value, - bool>::type = true> -static hid_t getHDF5Type(const T *) { - return H5T_NATIVE_ULONG; -} - -static H5T getHDF5Type(const char *const *) { - H5T var_string_type = H5T::FromHIDCheck(H5Tcopy(H5T_C_S1)); - PARTHENON_HDF5_CHECK(H5Tset_size(var_string_type, H5T_VARIABLE)); - return var_string_type; -} - // Implemented in CPP file as it's complex hid_t GenerateFileAccessProps(); @@ -247,36 +181,6 @@ void HDF5WriteAttribute(const std::string &name, const ParArrayGeneric &vi return HDF5WriteAttribute(name, view.KokkosView(), location); } -std::tuple, std::size_t> -HDF5GetAttributeInfo(hid_t location, const std::string &name, H5A &attr); - -template -std::vector HDF5ReadAttributeVec(hid_t location, const std::string &name) { - H5A attr; - auto [rank, dim, size] = HDF5GetAttributeInfo(location, name, attr); - std::vector res(size); - - // Check type - auto type = getHDF5Type(res.data()); - const H5T hdf5_type = H5T::FromHIDCheck(H5Aget_type(attr)); - auto status = PARTHENON_HDF5_CHECK(H5Tequal(type, hdf5_type)); - PARTHENON_REQUIRE_THROWS(status > 0, "Type mismatch for attribute " + name); - - // Read data from file - PARTHENON_HDF5_CHECK(H5Aread(attr, type, res.data())); - - return res; -} - -// template specialization for std::string (must go into cpp file) -template <> -std::vector HDF5ReadAttributeVec(hid_t location, const std::string &name); - -template <> -std::vector HDF5ReadAttributeVec(hid_t location, const std::string &name); - -void HDF5ReadAttribute(hid_t location, const std::string &name, std::string &val); - template ::value)> void HDF5ReadAttribute(hid_t location, const std::string &name, T &val) { auto vec = HDF5ReadAttributeVec(location, name); @@ -307,7 +211,8 @@ void HDF5ReadAttribute(hid_t location, const std::string &name, T &view) { auto *pdata = view.data(); auto view_h = Kokkos::create_mirror_view(view); if constexpr (!std::is_same::value) { - Kokkos::deep_copy(view_h, view); + // JMM: I need the pointer to point at host memory. But right now, + // only the type of the memory matters, not its contents. pdata = view_h.data(); } @@ -339,6 +244,23 @@ void HDF5ReadAttribute(hid_t location, const std::string &name, std::vector & vec = HDF5ReadAttributeVec(location, name); } +// Template extern declarations ensuring these are instantiated elsewhere +#define PARTHENON_ATTR_APPLY(...) 
\ + extern template void HDF5ReadAttribute<__VA_ARGS__>( \ + hid_t location, const std::string &name, __VA_ARGS__ &val); \ + extern template void HDF5WriteAttribute<__VA_ARGS__>( \ + const std::string &name, const __VA_ARGS__ &value, hid_t location) + +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(bool); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(int32_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(int64_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(uint32_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(uint64_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(float); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(double); + +#undef PARTHENON_ATTR_APPLY + } // namespace HDF5 } // namespace parthenon diff --git a/src/outputs/parthenon_hdf5_attributes.cpp b/src/outputs/parthenon_hdf5_attributes.cpp new file mode 100644 index 000000000000..7a917eee1767 --- /dev/null +++ b/src/outputs/parthenon_hdf5_attributes.cpp @@ -0,0 +1,141 @@ +//======================================================================================== +// Parthenon performance portable AMR framework +// Copyright(C) 2020-2023 The Parthenon collaboration +// Licensed under the 3-clause BSD License, see LICENSE file for details +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. 
+//========================================================================================
+
+#include "config.hpp"
+// Only proceed if HDF5 output enabled
+#ifdef ENABLE_HDF5
+
+// Definitions common to parthenon restart and parthenon output for HDF5
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "kokkos_abstraction.hpp"
+#include "utils/concepts_lite.hpp"
+#include "utils/error_checking.hpp"
+
+#include "outputs/parthenon_hdf5.hpp"
+
+namespace parthenon {
+namespace HDF5 {
+
+std::tuple<int, std::vector<hsize_t>, std::size_t>
+HDF5GetAttributeInfo(hid_t location, const std::string &name, H5A &attr) {
+  // check if attribute exists
+  auto status = PARTHENON_HDF5_CHECK(H5Aexists(location, name.c_str()));
+  PARTHENON_REQUIRE_THROWS(status > 0, "Attribute '" + name + "' does not exist");
+
+  // Open attribute
+  attr = H5A::FromHIDCheck(H5Aopen(location, name.c_str(), H5P_DEFAULT));
+
+  // Get attribute shape
+  const H5S dataspace = H5S::FromHIDCheck(H5Aget_space(attr));
+  int rank = PARTHENON_HDF5_CHECK(H5Sget_simple_extent_ndims(dataspace));
+  std::size_t size = 1;
+  std::vector<hsize_t> dim;
+  if (rank > 0) {
+    dim.resize(rank);
+    PARTHENON_HDF5_CHECK(H5Sget_simple_extent_dims(dataspace, dim.data(), NULL));
+    for (int d = 0; d < rank; ++d) {
+      size *= dim[d];
+    }
+    if (size == 0) {
+      PARTHENON_THROW("Attribute " + name + " has no value");
+    }
+  } else { // scalar quantity
+    dim.resize(1);
+    dim[0] = 1;
+  }
+  // JMM: H5Handle doesn't play nice with returning a tuple/structured
+  // binding, which is why it's not in the tuple. I think the issue is
+  // that H5Handle doesn't have a copy assignment operator, only a
+  // move operator. That probably implies not great things about the
+  // performance of returning the dim array by value here, but
+  // whatever. This isn't performance critical code.
+  return std::make_tuple(rank, dim, size);
+}
+
+// template specializations for std::string and bool
+void HDF5WriteAttribute(const std::string &name, const std::string &value,
+                        hid_t location) {
+  HDF5WriteAttribute(name, value.c_str(), location);
+}
+
+template <>
+void HDF5WriteAttribute(const std::string &name, const std::vector<std::string> &values,
+                        hid_t location) {
+  std::vector<const char *> char_ptrs(values.size());
+  for (size_t i = 0; i < values.size(); ++i) {
+    char_ptrs[i] = values[i].c_str();
+  }
+  HDF5WriteAttribute(name, char_ptrs, location);
+}
+
+template <>
+std::vector<std::string> HDF5ReadAttributeVec(hid_t location, const std::string &name) {
+  // get strings as char pointers, HDF5 will allocate the memory and we need to free it
+  auto char_ptrs = HDF5ReadAttributeVec<char *>(location, name);
+
+  // make strings out of char pointers, which copies the memory, and then free the memory
+  std::vector<std::string> res(char_ptrs.size());
+  for (size_t i = 0; i < res.size(); ++i) {
+    res[i] = std::string(char_ptrs[i]);
+    free(char_ptrs[i]);
+  }
+
+  return res;
+}
+
+// JMM: A little circular but it works.
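+// (Circular in the sense that the std::vector<bool> reader below delegates to
+// the HostArray1D<bool> overload of HDF5ReadAttribute, which lands back in the
+// generic attribute machinery; the element-by-element copy out is needed
+// because std::vector<bool> is a bit-packed proxy type with no .data().)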
+template <> +std::vector HDF5ReadAttributeVec(hid_t location, const std::string &name) { + HostArray1D temp; + HDF5ReadAttribute(location, name, temp); + std::vector out(temp.size()); + for (int i = 0; i < temp.size(); ++i) { + out[i] = temp[i]; + } + return out; +} + +template <> +void HDF5WriteAttribute(const std::string &name, const std::vector &values, + hid_t location) { + // can't use std::vector here because std::vector doesn't have .data() member + std::unique_ptr data(new hbool_t[values.size()]); + for (size_t i = 0; i < values.size(); ++i) { + data[i] = values[i]; + } + HDF5WriteAttribute(name, values.size(), data.get(), location); +} + +void HDF5ReadAttribute(hid_t location, const std::string &name, std::string &val) { + std::vector vec = HDF5ReadAttributeVec(location, name); + val = vec[0]; +} + +} // namespace HDF5 +} // namespace parthenon +#endif // ENABLE_HDF5 diff --git a/src/outputs/parthenon_hdf5_attributes_read.cpp b/src/outputs/parthenon_hdf5_attributes_read.cpp new file mode 100644 index 000000000000..f17d5a2498b7 --- /dev/null +++ b/src/outputs/parthenon_hdf5_attributes_read.cpp @@ -0,0 +1,61 @@ +//======================================================================================== +// Parthenon performance portable AMR framework +// Copyright(C) 2020-2023 The Parthenon collaboration +// Licensed under the 3-clause BSD License, see LICENSE file for details +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== + +#include "config.hpp" +// Only proceed if HDF5 output enabled +#ifdef ENABLE_HDF5 + +// Definitions common to parthenon restart and parthenon output for HDF5 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kokkos_abstraction.hpp" +#include "utils/concepts_lite.hpp" +#include "utils/error_checking.hpp" + +#include "outputs/parthenon_hdf5.hpp" + +namespace parthenon { +namespace HDF5 { + +#define PARTHENON_ATTR_APPLY(...) 
\ + template void HDF5ReadAttribute<__VA_ARGS__>(hid_t location, const std::string &name, \ + __VA_ARGS__ &val) + +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(bool); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(int32_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(int64_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(uint32_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(uint64_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(float); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(double); + +#undef PARTHENON_ATTR_APPLY + +} // namespace HDF5 +} // namespace parthenon + +#endif // ENABLE_HDF5 diff --git a/src/outputs/parthenon_hdf5_attributes_write.cpp b/src/outputs/parthenon_hdf5_attributes_write.cpp new file mode 100644 index 000000000000..376f32dedad4 --- /dev/null +++ b/src/outputs/parthenon_hdf5_attributes_write.cpp @@ -0,0 +1,61 @@ +//======================================================================================== +// Parthenon performance portable AMR framework +// Copyright(C) 2020-2023 The Parthenon collaboration +// Licensed under the 3-clause BSD License, see LICENSE file for details +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== + +#include "config.hpp" +// Only proceed if HDF5 output enabled +#ifdef ENABLE_HDF5 + +// Definitions common to parthenon restart and parthenon output for HDF5 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kokkos_abstraction.hpp" +#include "utils/concepts_lite.hpp" +#include "utils/error_checking.hpp" + +#include "outputs/parthenon_hdf5.hpp" + +namespace parthenon { +namespace HDF5 { + +#define PARTHENON_ATTR_APPLY(...) 
\ + template void HDF5WriteAttribute<__VA_ARGS__>( \ + const std::string &name, const __VA_ARGS__ &value, hid_t location) + +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(bool); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(int32_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(int64_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(uint32_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(uint64_t); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(float); +PARTHENON_ATTR_FOREACH_VECTOR_TYPE(double); + +#undef PARTHENON_ATTR_APPLY + +} // namespace HDF5 +} // namespace parthenon + +#endif // ENABLE_HDF5 diff --git a/src/outputs/parthenon_hdf5_base.hpp b/src/outputs/parthenon_hdf5_base.hpp new file mode 100644 index 000000000000..62ef8f5bff92 --- /dev/null +++ b/src/outputs/parthenon_hdf5_base.hpp @@ -0,0 +1,106 @@ +//======================================================================================== +// Parthenon performance portable AMR framework +// Copyright(C) 2020-2024 The Parthenon collaboration +// Licensed under the 3-clause BSD License, see LICENSE file for details +//======================================================================================== +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== +#ifndef OUTPUTS_PARTHENON_HDF5_BASE_HPP_ +#define OUTPUTS_PARTHENON_HDF5_BASE_HPP_ + +#include "config.hpp" + +#ifdef ENABLE_HDF5 + +// Definitions common to parthenon restart and parthenon output for HDF5 + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils/concepts_lite.hpp" +#include "utils/error_checking.hpp" + +namespace parthenon { +namespace HDF5 { +/** + * @brief RAII handles for HDF5. Use the typedefs directly (e.g. `H5A`, `H5D`, etc.) 
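+ *
+ * Illustrative use (not taken from this header): let the handle close the id
+ * on scope exit instead of calling the HDF5 close function by hand:
+ *
+ *   H5S space = H5S::FromHIDCheck(H5Screate(H5S_SCALAR));
+ *   // `space` converts implicitly to hid_t wherever the C API expects one,
+ *   // and H5Sclose runs automatically when it goes out of scope.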
+ *
+ * @tparam CloseFn - function pointer to destructor for HDF5 object
+ */
+template <herr_t (*CloseFn)(hid_t)>
+class H5Handle {
+ public:
+  H5Handle() = default;
+
+  H5Handle(H5Handle const &) = delete;
+  H5Handle &operator=(H5Handle const &) = delete;
+
+  H5Handle(H5Handle &&other) : hid_(other.Release()) {}
+  H5Handle &operator=(H5Handle &&other) {
+    Reset();
+    hid_ = other.Release();
+    return *this;
+  }
+
+  static H5Handle FromHIDCheck(hid_t const hid) {
+    PARTHENON_REQUIRE_THROWS(hid >= 0, "H5 FromHIDCheck failed");
+
+    H5Handle handle;
+    handle.hid_ = hid;
+    return handle;
+  }
+
+  void Reset() {
+    if (*this) {
+      PARTHENON_HDF5_CHECK(CloseFn(hid_));
+      hid_ = -1;
+    }
+  }
+
+  hid_t Release() {
+    auto hid = hid_;
+    hid_ = -1;
+    return hid;
+  }
+
+  ~H5Handle() { Reset(); }
+
+  // Implicit conversion to hid_t for convenience
+  operator hid_t() const { return hid_; }
+  explicit operator bool() const { return hid_ >= 0; }
+
+ private:
+  hid_t hid_ = -1;
+};
+
+using H5A = H5Handle<&H5Aclose>;
+using H5D = H5Handle<&H5Dclose>;
+using H5F = H5Handle<&H5Fclose>;
+using H5G = H5Handle<&H5Gclose>;
+using H5O = H5Handle<&H5Oclose>;
+using H5P = H5Handle<&H5Pclose>;
+using H5T = H5Handle<&H5Tclose>;
+using H5S = H5Handle<&H5Sclose>;
+
+} // namespace HDF5
+} // namespace parthenon
+
+#endif // ENABLE_HDF5
+#endif // OUTPUTS_PARTHENON_HDF5_BASE_HPP_
diff --git a/src/outputs/parthenon_hdf5_types.hpp b/src/outputs/parthenon_hdf5_types.hpp
new file mode 100644
index 000000000000..e9f771d50619
--- /dev/null
+++ b/src/outputs/parthenon_hdf5_types.hpp
@@ -0,0 +1,170 @@
+//========================================================================================
+// Parthenon performance portable AMR framework
+// Copyright(C) 2020-2024 The Parthenon collaboration
+// Licensed under the 3-clause BSD License, see LICENSE file for details
+//========================================================================================
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//========================================================================================
+#ifndef OUTPUTS_PARTHENON_HDF5_TYPES_HPP_
+#define OUTPUTS_PARTHENON_HDF5_TYPES_HPP_
+
+#include "config.hpp"
+
+#ifdef ENABLE_HDF5
+
+// Definitions common to parthenon restart and parthenon output for HDF5
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "utils/error_checking.hpp"
+
+namespace parthenon {
+namespace HDF5 {
+
+// Number of dimensions of HDF5 field data sets (block x nv x nu x nt x nz x ny x nx)
+static constexpr size_t H5_NDIM = 7;
+
+static constexpr int OUTPUT_VERSION_FORMAT = 3;
+
+/**
+ * @brief RAII handles for HDF5. Use the typedefs directly (e.g. `H5A`, `H5D`, etc.)
+ *
+ * @tparam CloseFn - function pointer to destructor for HDF5 object
+ */
+template <herr_t (*CloseFn)(hid_t)>
+class H5Handle {
+ public:
+  H5Handle() = default;
+
+  H5Handle(H5Handle const &) = delete;
+  H5Handle &operator=(H5Handle const &) = delete;
+
+  H5Handle(H5Handle &&other) : hid_(other.Release()) {}
+  H5Handle &operator=(H5Handle &&other) {
+    Reset();
+    hid_ = other.Release();
+    return *this;
+  }
+
+  static H5Handle FromHIDCheck(hid_t const hid) {
+    PARTHENON_REQUIRE_THROWS(hid >= 0, "H5 FromHIDCheck failed");
+
+    H5Handle handle;
+    handle.hid_ = hid;
+    return handle;
+  }
+
+  void Reset() {
+    if (*this) {
+      PARTHENON_HDF5_CHECK(CloseFn(hid_));
+      hid_ = -1;
+    }
+  }
+
+  hid_t Release() {
+    auto hid = hid_;
+    hid_ = -1;
+    return hid;
+  }
+
+  ~H5Handle() { Reset(); }
+
+  // Implicit conversion to hid_t for convenience
+  operator hid_t() const { return hid_; }
+  explicit operator bool() const { return hid_ >= 0; }
+
+ private:
+  hid_t hid_ = -1;
+};
+
+using H5A = H5Handle<&H5Aclose>;
+using H5D = H5Handle<&H5Dclose>;
+using H5F = H5Handle<&H5Fclose>;
+using H5G = H5Handle<&H5Gclose>;
+using H5O = H5Handle<&H5Oclose>;
+using H5P = H5Handle<&H5Pclose>;
+using H5T = H5Handle<&H5Tclose>;
+using H5S = H5Handle<&H5Sclose>;
+
+// Static functions to return HDF type
+static hid_t getHDF5Type(const hbool_t *) { return H5T_NATIVE_HBOOL; }
+static hid_t getHDF5Type(const int32_t *) { return H5T_NATIVE_INT32; }
+static hid_t getHDF5Type(const int64_t *) { return H5T_NATIVE_INT64; }
+static hid_t getHDF5Type(const uint32_t *) { return H5T_NATIVE_UINT32; }
+static hid_t getHDF5Type(const uint64_t *) { return H5T_NATIVE_UINT64; }
+static hid_t getHDF5Type(const float *) { return H5T_NATIVE_FLOAT; }
+static hid_t getHDF5Type(const double *) { return H5T_NATIVE_DOUBLE; }
+static hid_t getHDF5Type(const char *) { return H5T_NATIVE_CHAR; }
+
+// On MacOS size_t is "unsigned long" and uint64_t is != "unsigned long".
+// Thus, size_t is not captured by the overload above and needs to be selectively enabled.
+template <typename T, typename std::enable_if<std::is_same<T, size_t>::value && // NOLINT
+                                                  !std::is_same<size_t, uint64_t>::value,
+                                              bool>::type = true>
+static hid_t getHDF5Type(const T *) {
+  return H5T_NATIVE_ULONG;
+}
+
+static H5T getHDF5Type(const char *const *) {
+  H5T var_string_type = H5T::FromHIDCheck(H5Tcopy(H5T_C_S1));
+  PARTHENON_HDF5_CHECK(H5Tset_size(var_string_type, H5T_VARIABLE));
+  return var_string_type;
+}
+
+// JMM: This stuff is here, not in the rest of the
+// attributes code for crazy reasons involving the restart reader
+// and compile times.
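+//
+// For illustration (the location handle and attribute name are hypothetical),
+// a read through this interface reduces to a single call that opens the
+// attribute, checks that the stored HDF5 type matches T, and reads the data:
+//
+//   auto ids = HDF5ReadAttributeVec<int64_t>(group_id, "BlockIds");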
+std::tuple<int, std::vector<hsize_t>, std::size_t>
+HDF5GetAttributeInfo(hid_t location, const std::string &name, H5A &attr);
+
+void HDF5ReadAttribute(hid_t location, const std::string &name, std::string &val);
+
+template <typename T>
+std::vector<T> HDF5ReadAttributeVec(hid_t location, const std::string &name) {
+  H5A attr;
+  auto [rank, dim, size] = HDF5GetAttributeInfo(location, name, attr);
+  std::vector<T> res(size);
+
+  // Check type
+  auto type = getHDF5Type(res.data());
+  const H5T hdf5_type = H5T::FromHIDCheck(H5Aget_type(attr));
+  auto status = PARTHENON_HDF5_CHECK(H5Tequal(type, hdf5_type));
+  PARTHENON_REQUIRE_THROWS(status > 0, "Type mismatch for attribute " + name);
+
+  // Read data from file
+  PARTHENON_HDF5_CHECK(H5Aread(attr, type, res.data()));
+
+  return res;
+}
+
+// template specializations for std::string and bool (must go into cpp file)
+template <>
+std::vector<std::string> HDF5ReadAttributeVec(hid_t location, const std::string &name);
+
+template <>
+std::vector<bool> HDF5ReadAttributeVec(hid_t location, const std::string &name);
+
+} // namespace HDF5
+} // namespace parthenon
+
+#endif // ENABLE_HDF5
+#endif // OUTPUTS_PARTHENON_HDF5_TYPES_HPP_
diff --git a/src/outputs/restart.hpp b/src/outputs/restart.hpp
index d7ba1c167fef..69c301d30702 100644
--- a/src/outputs/restart.hpp
+++ b/src/outputs/restart.hpp
@@ -30,7 +30,7 @@
 #include
 
 #include "interface/metadata.hpp"
-#include "outputs/parthenon_hdf5.hpp"
+#include "outputs/parthenon_hdf5_types.hpp"
 
 using namespace parthenon::HDF5;
 
 // TODO(someone) the following "else" is very ugly but fixes missing types when not
diff --git a/src/parthenon/driver.hpp b/src/parthenon/driver.hpp
index 60dd5f3bd7b5..eced2c6684d3 100644
--- a/src/parthenon/driver.hpp
+++ b/src/parthenon/driver.hpp
@@ -26,9 +26,7 @@
 #include
 #include
 #include
-#include
-#include
-#include
+#include
 #include
 #include
 #include
@@ -66,6 +64,7 @@ using ::parthenon::TaskCollection;
 using ::parthenon::TaskID;
 using ::parthenon::TaskList;
 using ::parthenon::TaskListStatus;
+using ::parthenon::TaskQualifier;
 using ::parthenon::TaskRegion;
 using ::parthenon::TaskStatus;
 using ::parthenon::TaskType;
diff --git a/src/parthenon/package.hpp b/src/parthenon/package.hpp
index 5d80c6b13262..9da6521da94b 100644
--- a/src/parthenon/package.hpp
+++ b/src/parthenon/package.hpp
@@ -31,6 +31,7 @@
 #include
 #include
 #include
+#include
 #include
 
 // Local Includes
@@ -46,6 +47,7 @@ using ::parthenon::ApplicationInput;
 using ::parthenon::BlockList_t;
 using ::parthenon::DevExecSpace;
 using ::parthenon::HostExecSpace;
+using ::parthenon::IndexSplit;
 using ::parthenon::Mesh;
 using ::parthenon::MeshBlock;
 using ::parthenon::MeshBlockPack;
diff --git a/src/parthenon/prelude.hpp b/src/parthenon/prelude.hpp
index 8fafb2c94204..6f28cdcb9ee0 100644
--- a/src/parthenon/prelude.hpp
+++ b/src/parthenon/prelude.hpp
@@ -33,6 +33,7 @@ namespace prelude {
 using ::parthenon::BoundaryCommSubset;
 using ::parthenon::IndexDomain;
 using ::parthenon::IndexRange;
+using ::parthenon::KokkosTimer;
 using ::parthenon::MeshBlock;
 using ::parthenon::MeshBlockData;
 using ::parthenon::MeshData;
diff --git a/src/tasks/task_id.hpp b/src/parthenon_arrays.cpp
similarity index 51%
rename from src/tasks/task_id.hpp
rename to src/parthenon_arrays.cpp
index 942cf0a4e5ed..5a946d14a89a 100644
--- a/src/tasks/task_id.hpp
+++ b/src/parthenon_arrays.cpp
@@ -1,5 +1,9 @@
 //========================================================================================
-// (C) (or copyright) 2020. Triad National Security, LLC. All rights reserved.
+// Athena++ astrophysical MHD code
+// Copyright(C) 2014 James M. Stone and other code contributors
+// Licensed under the 3-clause BSD License, see LICENSE file for details
+//========================================================================================
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S. Government contract 89233218CNA000001 for Los
 // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
 // for the U.S. Department of Energy/National Nuclear Security Administration. All rights
 // in the program are reserved by Triad National Security, LLC, and the U.S. Department
 // of Energy/National Nuclear Security Administration. The Government is granted for
 // itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
 // license in this material to reproduce, prepare derivative works, distribute copies to
 // the public, perform publicly and display publicly, and to permit others to do so.
 //========================================================================================
 
-#ifndef TASKS_TASK_ID_HPP_
-#define TASKS_TASK_ID_HPP_
-
-#include
-#include
-#include
-
+#include "parthenon_arrays.hpp"
 #include "basic_types.hpp"
 
 namespace parthenon {
 
-//----------------------------------------------------------------------------------------
-//! \class TaskID
-//  \brief generalization of bit fields for Task IDs, status, and dependencies.
+#define PARTHENON_ARRAY_SPEC(T) \
+  template class ParArrayGeneric<device_view_t<T>, empty_state_t>
 
-#define BITBLOCK 64
+PARTHENON_ARRAY_SPEC(Real);
 
-class TaskID {
- public:
-  TaskID() : nbits_set(0), bit(-1) { Set(0); }
-  explicit TaskID(int id);
-
-  void Set(int id);
-  void clear();
-  bool CheckDependencies(const TaskID &rhs) const;
-  void SetFinished(const TaskID &rhs);
-  bool operator==(const TaskID &rhs) const;
-  bool operator!=(const TaskID &rhs) const;
-  TaskID operator|(const TaskID &rhs) const;
-  std::string to_string() const;
-
- private:
-  int nbits_set;
-  int bit;
-  std::vector<uint64_t> bitblocks;
-};
+#undef PARTHENON_ARRAY_SPEC
 
 } // namespace parthenon
-
-#endif // TASKS_TASK_ID_HPP_
diff --git a/src/parthenon_arrays.hpp b/src/parthenon_arrays.hpp
index 5f7f4672274e..bba59f76f56b 100644
--- a/src/parthenon_arrays.hpp
+++ b/src/parthenon_arrays.hpp
@@ -3,7 +3,7 @@
 // Copyright(C) 2014 James M. Stone and other code contributors
 // Licensed under the 3-clause BSD License, see LICENSE file for details
 //========================================================================================
-// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S.
Government contract 89233218CNA000001 for Los
 // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -35,23 +35,21 @@
 #define PARARRAY_TEMP \
   "ParArrayND:" + std::string(__FILE__) + ":" + std::to_string(__LINE__)
 
-inline constexpr std::size_t MAX_VARIABLE_DIMENSION = 7;
-
 namespace parthenon {
 
-template <typename T, typename Layout = LayoutWrapper>
-using device_view_t =
-    Kokkos::View<multi_pointer_t<T, MAX_VARIABLE_DIMENSION>, Layout, DevMemSpace>;
-
-template <typename T, typename Layout = LayoutWrapper>
-using host_view_t = typename device_view_t<T, Layout>::HostMirror;
-
 template <typename T, typename State = empty_state_t>
 using ParArrayND = ParArrayGeneric<device_view_t<T>, State>;
 
 template <typename T, typename State = empty_state_t>
 using ParArrayHost = ParArrayGeneric<host_view_t<T>, State>;
 
+#define PARTHENON_ARRAY_DECL(T) \
+  extern template class ParArrayGeneric<device_view_t<T>, empty_state_t>
+
+PARTHENON_ARRAY_DECL(Real);
+
+#undef PARTHENON_ARRAY_DECL
+
 } // namespace parthenon
 
 #endif // PARTHENON_ARRAYS_HPP_
diff --git a/src/parthenon_manager.cpp b/src/parthenon_manager.cpp
index 2329553bfe76..814a8c6f094f 100644
--- a/src/parthenon_manager.cpp
+++ b/src/parthenon_manager.cpp
@@ -3,7 +3,7 @@
 // Copyright(C) 2020-2023 The Parthenon collaboration
 // Licensed under the 3-clause BSD License, see LICENSE file for details
 //========================================================================================
-// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S. Government contract 89233218CNA000001 for Los
 // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -337,6 +337,8 @@ void ParthenonManager::RestartPackages(Mesh &rm, RestartReader &resfile) {
       // Double note that this also needs to be update in case
       // we update the HDF5 infrastructure!
       if (file_output_format_ver == -1) {
+        PARTHENON_WARN("This file output format version is deprecated and will be "
+                       "removed in a future release.");
         for (int k = out_kb.s; k <= out_kb.e; ++k) {
           for (int j = out_jb.s; j <= out_jb.e; ++j) {
             for (int i = out_ib.s; i <= out_ib.e; ++i) {
@@ -348,19 +350,9 @@ void ParthenonManager::RestartPackages(Mesh &rm, RestartReader &resfile) {
       } else if (file_output_format_ver == 2 ||
                  file_output_format_ver == HDF5::OUTPUT_VERSION_FORMAT) {
-        for (int t = 0; t < Nt; ++t) {
-          for (int u = 0; u < Nu; ++u) {
-            for (int v = 0; v < Nv; ++v) {
-              for (int k = out_kb.s; k <= out_kb.e; ++k) {
-                for (int j = out_jb.s; j <= out_jb.e; ++j) {
-                  for (int i = out_ib.s; i <= out_ib.e; ++i) {
-                    v_h(t, u, v, k, j, i) = tmp[index++];
-                  }
-                }
-              }
-            }
-          }
-        }
+        OutputUtils::PackOrUnpackVar(pmb.get(), v.get(), resfile.hasGhost, index, tmp,
+                                     [&](auto index, int t, int u, int v, int k, int j,
+                                         int i) { v_h(t, u, v, k, j, i) = tmp[index]; });
       } else {
         PARTHENON_THROW("Unknown output format version in restart file.")
       }
@@ -389,9 +381,8 @@ void ParthenonManager::RestartPackages(Mesh &rm, RestartReader &resfile) {
   std::size_t block_index = 0; // only want to do this once per block
   for (auto &pmb : rm.block_list) {
-    ParArrayND<int> new_indices;
     auto pswarm_blk = (pmb->swarm_data.Get())->Get(swarmname);
-    pswarm_blk->AddEmptyParticles(counts[block_index], new_indices);
+    pswarm_blk->AddEmptyParticles(counts[block_index]);
     block_index++;
   }
   ReadSwarmVars_(swarm, rm.block_list, count_on_rank, offsets[0]);
diff --git a/src/pgen/default_pgen.cpp b/src/pgen/default_pgen.cpp
index fc745dd008d8..6ceecc2db998 100644
--- a/src/pgen/default_pgen.cpp
+++ b/src/pgen/default_pgen.cpp
@@ -3,7 +3,7 @@
 // Copyright(C) 2014 James M.
Stone and other code contributors // Licensed under the 3-clause BSD License, see LICENSE file for details //======================================================================================== -// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -110,7 +110,7 @@ void MeshBlock::InitMeshBlockUserDataDefault(MeshBlock *pmb, ParameterInput *pin } //======================================================================================== -//! \fn void MeshBlock::ProblemGeneratorDefault(ParameterInput *pin) +//! \fn void MeshBlock::ProblemGeneratorDefault(MeshBlock *pmb, ParameterInput *pin) // \brief Should be used to set initial conditions. //======================================================================================== @@ -120,6 +120,23 @@ void MeshBlock::ProblemGeneratorDefault(MeshBlock *pmb, ParameterInput *pin) { return; } +//======================================================================================== +//! \fn void Mesh::UserMeshWorkBeforeOutputDefault(Mesh *pmb, ParameterInput *pin, SimTime +//! &t) +// \brief Function called before generating output files +//======================================================================================== + +void Mesh::UserMeshWorkBeforeOutputDefault(Mesh *, ParameterInput *, SimTime const &) { + // do nothing + return; +} + +//! \fn void MeshBlock::PostInitializationDefault(MeshBlock *pmb, ParameterInput *pin) +// \brief Should be used to perform post initialization ops. +//======================================================================================== + +void MeshBlock::PostInitializationDefault(MeshBlock *pmb, ParameterInput *pin) { return; } + //======================================================================================== //! 
\fn void MeshBlock::UserWorkBeforeOutputDefault(MeshBlock *pmb, ParameterInput *pin) // \brief Function called before generating output files diff --git a/src/prolong_restrict/pr_loops.hpp b/src/prolong_restrict/pr_loops.hpp index 6f9cf823bc37..288b18ed60a6 100644 --- a/src/prolong_restrict/pr_loops.hpp +++ b/src/prolong_restrict/pr_loops.hpp @@ -113,6 +113,7 @@ inline void ProlongationRestrictionLoop(const ProResInfoArr_t &info, const Idx_t &buffer_idxs, const IndexShape &cellbounds, const IndexShape &c_cellbounds, const RefinementOp_t op, const std::size_t nbuffers) { + PARTHENON_INSTRUMENT const IndexDomain interior = IndexDomain::interior; auto ckb = c_cellbounds.GetBoundsK(interior); auto cjb = c_cellbounds.GetBoundsJ(interior); @@ -123,8 +124,8 @@ ProlongationRestrictionLoop(const ProResInfoArr_t &info, const Idx_t &buffer_idx const int scratch_level = 1; // 0 is actual scratch (tiny); 1 is HBM size_t scratch_size_in_bytes = 1; par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "ProlongateOrRestrictCellCenteredValues", - DevExecSpace(), scratch_size_in_bytes, scratch_level, 0, nbuffers - 1, + DEFAULT_OUTER_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), + scratch_size_in_bytes, scratch_level, 0, nbuffers - 1, KOKKOS_LAMBDA(team_mbr_t team_member, const int sub_idx) { const std::size_t buf = buffer_idxs(sub_idx); if (DoRefinementOp(info(buf), op)) { @@ -151,14 +152,15 @@ InnerHostProlongationRestrictionLoop(std::size_t buf, const ProResInfoArrHost_t const IndexRange &ckb, const IndexRange &cjb, const IndexRange &cib, const IndexRange &kb, const IndexRange &jb, const IndexRange &ib) { + PARTHENON_INSTRUMENT const auto &idxer = info(buf).idxer[static_cast(CEL)]; auto coords = info(buf).coords; auto coarse_coords = info(buf).coarse_coords; auto coarse = info(buf).coarse; auto fine = info(buf).fine; par_for( - DEFAULT_LOOP_PATTERN, "ProlongateOrRestrictCellCenteredValues", DevExecSpace(), 0, - 0, 0, 0, 0, idxer.size() - 1, KOKKOS_LAMBDA(const int, const int, const int ii) { + DEFAULT_LOOP_PATTERN, PARTHENON_AUTO_LABEL, DevExecSpace(), 0, 0, 0, 0, 0, + idxer.size() - 1, KOKKOS_LAMBDA(const int, const int, const int ii) { const auto [t, u, v, k, j, i] = idxer(ii); if (idxer.IsActive(k, j, i)) { Stencil::template Do(t, u, v, k, j, i, ckb, cjb, cib, kb, jb, ib, diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index aa261f5fbde4..951a4c78d7f8 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -24,8 +24,8 @@ #include "kokkos_abstraction.hpp" #include "solvers/mg_solver.hpp" #include "solvers/solver_utils.hpp" -#include "tasks/task_id.hpp" -#include "tasks/task_list.hpp" + +#include "tasks/tasks.hpp" namespace parthenon { @@ -92,13 +92,12 @@ class BiCGSTABSolver { return preconditioner.AddSetupTasks(region, tl, dependence, partition, reg_dep_id, pmesh); } - TaskID AddTasks(TaskList &tl, IterativeTasks &itl, TaskID dependence, int i, - Mesh *pmesh, TaskRegion ®ion, int ®_dep_id) { + TaskID AddTasks(TaskList &tl, TaskID dependence, Mesh *pmesh, const int partition) { using namespace utils; - auto &md = pmesh->mesh_data.GetOrAdd("base", i); - std::string label = "bicg_comm_" + std::to_string(i); + TaskID none; + auto &md = pmesh->mesh_data.GetOrAdd("base", partition); + std::string label = "bicg_comm_" + std::to_string(partition); auto &md_comm = pmesh->mesh_data.AddShallow(label, md, std::vector{u::name()}); - iter_counter = 0; bool multilevel = pmesh->multilevel; @@ -109,12 +108,11 @@ class BiCGSTABSolver { auto copy_r = 
tl.AddTask(dependence, CopyData, md); auto copy_p = tl.AddTask(dependence, CopyData, md); auto copy_rhat0 = tl.AddTask(dependence, CopyData, md); - auto get_rhat0r_init = - DotProduct(dependence, region, tl, i, reg_dep_id, &rhat0r, md); + auto get_rhat0r_init = DotProduct(dependence, tl, &rhat0r, md); auto initialize = tl.AddTask( + TaskQualifier::once_per_region | TaskQualifier::local_sync, zero_x | zero_u_init | copy_r | copy_p | copy_rhat0 | get_rhat0r_init, - [](BiCGSTABSolver *solver, int partition) { - if (partition != 0) return TaskStatus::complete; + [](BiCGSTABSolver *solver) { solver->rhat0r_old = solver->rhat0r.val; solver->rhat0r.val = 0.0; solver->rhat0v.val = 0.0; @@ -123,28 +121,25 @@ class BiCGSTABSolver { solver->residual.val = 0.0; return TaskStatus::complete; }, - this, i); - region.AddRegionalDependencies(reg_dep_id, i, initialize); - reg_dep_id++; - if (i == 0 && params_.print_per_step) { - tl.AddTask(dependence, [&]() { - if (Globals::my_rank == 0) - printf("# [0] v-cycle\n# [1] rms-residual\n# [2] rms-error\n"); - return TaskStatus::complete; - }); - } + this); + tl.AddTask(TaskQualifier::once_per_region, dependence, [&]() { + if (Globals::my_rank == 0 && params_.print_per_step) + printf("# [0] v-cycle\n# [1] rms-residual\n# [2] rms-error\n"); + return TaskStatus::complete; + }); // BEGIN ITERATIVE TASKS + auto [itl, solver_id] = tl.AddSublist(initialize, {1, params_.max_iters}); // 1. u <- M p - auto precon1 = initialize; + auto precon1 = none; if (params_.precondition) { auto set_rhs = itl.AddTask(precon1, CopyData, md); auto zero_u = itl.AddTask(precon1, SetToZero, md); - precon1 = preconditioner.AddLinearOperatorTasks(region, itl, set_rhs | zero_u, i, - reg_dep_id, pmesh); + precon1 = + preconditioner.AddLinearOperatorTasks(itl, set_rhs | zero_u, partition, pmesh); } else { - precon1 = itl.AddTask(initialize, CopyData, md); + precon1 = itl.AddTask(none, CopyData, md); } // 2. v <- A u @@ -152,8 +147,7 @@ class BiCGSTABSolver { auto get_v = eqs_.template Ax(itl, comm, md); // 3. rhat0v <- (rhat0, v) - auto get_rhat0v = - DotProduct(get_v, region, itl, i, reg_dep_id, &rhat0v, md); + auto get_rhat0v = DotProduct(get_v, itl, &rhat0v, md); // 4. h <- x + alpha u (alpha = rhat0r_old / rhat0v) auto correct_h = itl.AddTask( @@ -174,26 +168,25 @@ class BiCGSTABSolver { this, md); // Check and print out residual - auto get_res = DotProduct(correct_s, region, itl, i, reg_dep_id, &residual, md); + auto get_res = DotProduct(correct_s, itl, &residual, md); auto print = itl.AddTask( - get_res, - [&](BiCGSTABSolver *solver, Mesh *pmesh, int partition) { - if (partition != 0) return TaskStatus::complete; + TaskQualifier::once_per_region, get_res, + [&](BiCGSTABSolver *solver, Mesh *pmesh) { Real rms_res = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); if (Globals::my_rank == 0 && solver->params_.print_per_step) printf("%i %e\n", solver->iter_counter * 2 + 1, rms_res); return TaskStatus::complete; }, - this, pmesh, i); + this, pmesh); // 6. u <- M s auto precon2 = correct_s; if (params_.precondition) { auto set_rhs = itl.AddTask(precon2, CopyData, md); auto zero_u = itl.AddTask(precon2, SetToZero, md); - precon2 = preconditioner.AddLinearOperatorTasks(region, itl, set_rhs | zero_u, i, - reg_dep_id, pmesh); + precon2 = + preconditioner.AddLinearOperatorTasks(itl, set_rhs | zero_u, partition, pmesh); } else { precon2 = itl.AddTask(precon2, CopyData, md); } @@ -203,12 +196,12 @@ class BiCGSTABSolver { auto get_t = eqs_.template Ax(itl, pre_t_comm, md); // 8. 
omega <- (t,s) / (t,t) - auto get_ts = DotProduct(get_t, region, itl, i, reg_dep_id, &ts, md); - auto get_tt = DotProduct(get_t, region, itl, i, reg_dep_id, &tt, md); + auto get_ts = DotProduct(get_t, itl, &ts, md); + auto get_tt = DotProduct(get_t, itl, &tt, md); // 9. x <- h + omega u auto correct_x = itl.AddTask( - get_tt | get_ts, + TaskQualifier::local_sync, get_tt | get_ts, [](BiCGSTABSolver *solver, std::shared_ptr> &md) { Real omega = solver->ts.val / solver->tt.val; return AddFieldsAndStore(md, 1.0, omega); @@ -225,29 +218,25 @@ class BiCGSTABSolver { this, md); // Check and print out residual - auto get_res2 = - DotProduct(correct_r, region, itl, i, reg_dep_id, &residual, md); - - if (i == 0) { - get_res2 = itl.AddTask( - get_res2, - [&](BiCGSTABSolver *solver, Mesh *pmesh) { - Real rms_err = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); - if (Globals::my_rank == 0 && solver->params_.print_per_step) - printf("%i %e\n", solver->iter_counter * 2 + 2, rms_err); - return TaskStatus::complete; - }, - this, pmesh); - } + auto get_res2 = DotProduct(correct_r, itl, &residual, md); + + get_res2 = itl.AddTask( + TaskQualifier::once_per_region, get_res2, + [&](BiCGSTABSolver *solver, Mesh *pmesh) { + Real rms_err = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); + if (Globals::my_rank == 0 && solver->params_.print_per_step) + printf("%i %e\n", solver->iter_counter * 2 + 2, rms_err); + return TaskStatus::complete; + }, + this, pmesh); // 11. rhat0r <- (rhat0, r) - auto get_rhat0r = - DotProduct(correct_r, region, itl, i, reg_dep_id, &rhat0r, md); + auto get_rhat0r = DotProduct(correct_r, itl, &rhat0r, md); // 12. beta <- rhat0r / rhat0r_old * alpha / omega // 13. p <- r + beta * (p - omega * v) auto update_p = itl.AddTask( - get_rhat0r | get_res2, + TaskQualifier::local_sync, get_rhat0r | get_res2, [](BiCGSTABSolver *solver, std::shared_ptr> &md) { Real alpha = solver->rhat0r_old / solver->rhat0v.val; Real omega = solver->ts.val / solver->tt.val; @@ -259,15 +248,16 @@ class BiCGSTABSolver { this, md); // 14. rhat0r_old <- rhat0r, zero all reductions - region.AddRegionalDependencies(reg_dep_id, i, update_p | correct_x); Real *ptol = presidual_tolerance == nullptr ? 
&(params_.residual_tolerance) : presidual_tolerance; - auto check = itl.SetCompletionTask( + auto check = itl.AddTask( + TaskQualifier::completion | TaskQualifier::once_per_region | + TaskQualifier::global_sync, update_p | correct_x, - [](BiCGSTABSolver *solver, Mesh *pmesh, int partition, int max_iter, - Real *res_tol) { - if (partition != 0) return TaskStatus::complete; + [](BiCGSTABSolver *solver, Mesh *pmesh, int max_iter, Real *res_tol) { solver->iter_counter++; Real rms_res = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); + solver->final_residual = rms_res; + solver->final_iteration = solver->iter_counter; if (rms_res < *res_tol || solver->iter_counter >= max_iter) { solver->final_residual = rms_res; solver->final_iteration = solver->iter_counter; @@ -281,9 +271,7 @@ class BiCGSTABSolver { solver->residual.val = 0.0; return TaskStatus::iterate; }, - this, pmesh, i, params_.max_iters, ptol); - region.AddGlobalDependencies(reg_dep_id, i, check); - reg_dep_id++; + this, pmesh, params_.max_iters, ptol); return tl.AddTask(check, CopyData, md); } diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index feb39f9f635a..655f0c4eed44 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -23,8 +23,8 @@ #include "interface/state_descriptor.hpp" #include "kokkos_abstraction.hpp" #include "solvers/solver_utils.hpp" -#include "tasks/task_id.hpp" -#include "tasks/task_list.hpp" + +#include "tasks/tasks.hpp" namespace parthenon { @@ -94,62 +94,56 @@ class MGSolver { pkg->AddField(D::name(), mD); } - TaskID AddTasks(TaskList & /*tl*/, IterativeTasks &itl, TaskID dependence, - int partition, Mesh *pmesh, TaskRegion ®ion, int ®_dep_id) { + TaskID AddTasks(TaskList &tl, TaskID dependence, Mesh *pmesh, const int partition) { using namespace utils; + TaskID none; + auto [itl, solve_id] = tl.AddSublist(dependence, {1, this->params_.max_iters}); iter_counter = 0; itl.AddTask( - dependence, - [](int partition, int *iter_counter) { - if (partition != 0 || *iter_counter > 0 || Globals::my_rank != 0) - return TaskStatus::complete; + TaskQualifier::once_per_region, none, + [](int *iter_counter) { + if (*iter_counter > 0 || Globals::my_rank != 0) return TaskStatus::complete; printf("# [0] v-cycle\n# [1] rms-residual\n# [2] rms-error\n"); return TaskStatus::complete; }, - partition, &iter_counter); - auto mg_finest = - AddLinearOperatorTasks(region, itl, dependence, partition, reg_dep_id, pmesh); + &iter_counter); + auto mg_finest = AddLinearOperatorTasks(itl, none, partition, pmesh); auto &md = pmesh->mesh_data.GetOrAdd("base", partition); auto comm = AddBoundaryExchangeTasks(mg_finest, itl, md, true); auto calc_pointwise_res = eqs_.template Ax(itl, comm, md); calc_pointwise_res = itl.AddTask( calc_pointwise_res, AddFieldsAndStoreInteriorSelect, md, 1.0, -1.0, false); - auto get_res = DotProduct(calc_pointwise_res, region, itl, - partition, reg_dep_id, &residual, md); + auto get_res = DotProduct(calc_pointwise_res, itl, &residual, md); - auto check = itl.SetCompletionTask( + auto check = itl.AddTask( + TaskQualifier::once_per_region | TaskQualifier::completion | + TaskQualifier::global_sync, get_res, - [](MGSolver *solver, int part, Mesh *pmesh) { - if (part != 0) return TaskStatus::complete; + [](MGSolver *solver, Mesh *pmesh) { solver->iter_counter++; Real rms_res = std::sqrt(solver->residual.val / pmesh->GetTotalCells()); if (Globals::my_rank == 0) printf("%i %e\n", solver->iter_counter, rms_res); - if (rms_res > solver->params_.residual_tolerance && - 
solver->iter_counter < solver->params_.max_iters) - return TaskStatus::iterate; solver->final_residual = rms_res; solver->final_iteration = solver->iter_counter; + if (rms_res > solver->params_.residual_tolerance) return TaskStatus::iterate; return TaskStatus::complete; }, - this, partition, pmesh); - region.AddGlobalDependencies(reg_dep_id, partition, check); - reg_dep_id++; + this, pmesh); - return check; + return solve_id; } - template - TaskID AddLinearOperatorTasks(TaskRegion ®ion, TL_t &tl, TaskID dependence, - int partition, int ®_dep_id, Mesh *pmesh) { + TaskID AddLinearOperatorTasks(TaskList &tl, TaskID dependence, int partition, + Mesh *pmesh) { using namespace utils; iter_counter = 0; int min_level = std::max(pmesh->GetGMGMaxLevel() - params_.max_coarsenings, 0); int max_level = pmesh->GetGMGMaxLevel(); - return AddMultiGridTasksPartitionLevel(region, tl, dependence, partition, reg_dep_id, - max_level, min_level, max_level, pmesh); + return AddMultiGridTasksPartitionLevel(tl, dependence, partition, max_level, + min_level, max_level, pmesh); } template @@ -333,10 +327,9 @@ class MGSolver { return task_out; } - template - TaskID AddMultiGridTasksPartitionLevel(TaskRegion ®ion, TL_t &tl, TaskID dependence, - int partition, int ®_dep_id, int level, - int min_level, int max_level, Mesh *pmesh) { + TaskID AddMultiGridTasksPartitionLevel(TaskList &tl, TaskID dependence, int partition, + int level, int min_level, int max_level, + Mesh *pmesh) { using namespace utils; auto smoother = params_.smoother; bool do_FAS = params_.do_FAS; @@ -369,10 +362,8 @@ class MGSolver { // Fill fields with restricted values auto recv_from_finer = tl.AddTask(dependence, ReceiveBoundBufs, md_comm); - set_from_finer = - tl.AddTask(recv_from_finer, SetBounds, md_comm); - region.AddRegionalDependencies(reg_dep_id, partition, set_from_finer); - reg_dep_id++; + set_from_finer = tl.AddTask( // TaskQualifier::local_sync, // is this required? + recv_from_finer, SetBounds, md_comm); // 1. Copy residual from dual purpose communication field to the rhs, should be // actual RHS for finest level if (!do_FAS) { @@ -421,19 +412,16 @@ class MGSolver { auto communicate_to_coarse = tl.AddTask(residual, SendBoundBufs, md_comm); - auto coarser = AddMultiGridTasksPartitionLevel(region, tl, communicate_to_coarse, - partition, reg_dep_id, level - 1, - min_level, max_level, pmesh); + auto coarser = AddMultiGridTasksPartitionLevel( + tl, communicate_to_coarse, partition, level - 1, min_level, max_level, pmesh); // 6. Receive error field into communication field and prolongate auto recv_from_coarser = tl.AddTask(coarser, ReceiveBoundBufs, md_comm); auto set_from_coarser = tl.AddTask(recv_from_coarser, SetBounds, md_comm); - auto prolongate = tl.AddTask( + auto prolongate = tl.AddTask( // TaskQualifier::local_sync, // is this required? set_from_coarser, ProlongateBounds, md_comm); - region.AddRegionalDependencies(reg_dep_id, partition, prolongate); - reg_dep_id++; // 7. 
Correct solution on this level with res_err field and store in
     // communication field
diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp
index 49092e906bca..a5ad37934619 100644
--- a/src/solvers/solver_utils.hpp
+++ b/src/solvers/solver_utils.hpp
@@ -280,32 +280,24 @@ TaskStatus DotProductLocal(const std::shared_ptr<MeshData<Real>> &md,
   return TaskStatus::complete;
 }
 
-template <class a_t, class b_t, class TL_t>
-TaskID DotProduct(TaskID dependency_in, TaskRegion &region, TL_t &tl, int partition,
-                  int &reg_dep_id, AllReduce<Real> *adotb,
+template <class a_t, class b_t>
+TaskID DotProduct(TaskID dependency_in, TaskList &tl, AllReduce<Real> *adotb,
                   const std::shared_ptr<MeshData<Real>> &md) {
   using namespace impl;
-  auto zero_adotb = (partition == 0 ? tl.AddTask(
-                                          dependency_in,
-                                          [](AllReduce<Real> *r) {
-                                            r->val = 0.0;
-                                            return TaskStatus::complete;
-                                          },
-                                          adotb)
-                                    : dependency_in);
-  region.AddRegionalDependencies(reg_dep_id, partition, zero_adotb);
-  reg_dep_id++;
-  auto get_adotb = tl.AddTask(zero_adotb, DotProductLocal<a_t, b_t>, md, adotb);
-  region.AddRegionalDependencies(reg_dep_id, partition, get_adotb);
-  reg_dep_id++;
-  auto start_global_adotb =
-      (partition == 0
-           ? tl.AddTask(get_adotb, &AllReduce<Real>::StartReduce, adotb, MPI_SUM)
-           : get_adotb);
+  auto zero_adotb = tl.AddTask(
+      TaskQualifier::once_per_region | TaskQualifier::local_sync, dependency_in,
+      [](AllReduce<Real> *r) {
+        r->val = 0.0;
+        return TaskStatus::complete;
+      },
+      adotb);
+  auto get_adotb = tl.AddTask(TaskQualifier::local_sync, zero_adotb,
+                              DotProductLocal<a_t, b_t>, md, adotb);
+  auto start_global_adotb = tl.AddTask(TaskQualifier::once_per_region, get_adotb,
+                                       &AllReduce<Real>::StartReduce, adotb, MPI_SUM);
   auto finish_global_adotb =
-      tl.AddTask(start_global_adotb, &AllReduce<Real>::CheckReduce, adotb);
-  region.AddRegionalDependencies(reg_dep_id, partition, finish_global_adotb);
-  reg_dep_id++;
+      tl.AddTask(TaskQualifier::once_per_region | TaskQualifier::local_sync,
+                 start_global_adotb, &AllReduce<Real>::CheckReduce, adotb);
   return finish_global_adotb;
 }
 
@@ -337,33 +329,22 @@ TaskStatus GlobalMinLocal(const std::shared_ptr<MeshData<Real>> &md,
   return TaskStatus::complete;
 }
 
-template <class var, class TL_t>
-TaskID GlobalMin(TaskID dependency_in, TaskRegion &region, TL_t &tl, int partition,
-                 int &reg_dep_id, AllReduce<Real> *amin,
+template <class var>
+TaskID GlobalMin(TaskID dependency_in, TaskList &tl, AllReduce<Real> *amin,
                  const std::shared_ptr<MeshData<Real>> &md) {
   using namespace impl;
-  auto zero_amin = (partition == 0 ? tl.AddTask(
-                                         dependency_in,
-                                         [](AllReduce<Real> *r) {
-                                           r->val = std::numeric_limits<Real>::max();
-                                           return TaskStatus::complete;
-                                         },
-                                         amin)
-                                   : dependency_in);
-  region.AddRegionalDependencies(reg_dep_id, partition, zero_amin);
-  reg_dep_id++;
-  auto get_amin = tl.AddTask(zero_amin, GlobalMinLocal<var>, md, amin);
-  region.AddRegionalDependencies(reg_dep_id, partition, get_amin);
-  reg_dep_id++;
-  auto start_global_amin =
-      (partition == 0
-           ? tl.AddTask(get_amin, &AllReduce<Real>::StartReduce, amin, MPI_MIN)
-           : get_amin);
-  auto finish_global_amin =
-      tl.AddTask(start_global_amin, &AllReduce<Real>::CheckReduce, amin);
-  region.AddRegionalDependencies(reg_dep_id, partition, finish_global_amin);
-  reg_dep_id++;
-  return finish_global_amin;
+  auto max_amin = tl.AddTask(
+      TaskQualifier::once_per_region | TaskQualifier::local_sync, dependency_in,
+      [](AllReduce<Real> *r) {
+        r->val = std::numeric_limits<Real>::max();
+        return TaskStatus::complete;
+      },
+      amin);
+  auto get_amin =
+      tl.AddTask(TaskQualifier::local_sync, max_amin, GlobalMinLocal<var>, md, amin);
+  auto start_global_amin = tl.AddTask(TaskQualifier::once_per_region, get_amin,
+                                      &AllReduce<Real>::StartReduce, amin, MPI_MIN);
+  return tl.AddTask(TaskQualifier::once_per_region | TaskQualifier::local_sync,
+                    start_global_amin, &AllReduce<Real>::CheckReduce, amin);
 }
 
 } // namespace utils
diff --git a/src/tasks/task_id.cpp b/src/tasks/task_id.cpp
deleted file mode 100644
index a52f1e3419bc..000000000000
--- a/src/tasks/task_id.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-//========================================================================================
-// Athena++ astrophysical MHD code
-// Copyright(C) 2014 James M. Stone and other code contributors
-// Licensed under the 3-clause BSD License, see LICENSE file for details
-//========================================================================================
-// (C) (or copyright) 2020. Triad National Security, LLC. All rights reserved.
-//
-// This program was produced under U.S. Government contract 89233218CNA000001 for Los
-// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
-// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
-// in the program are reserved by Triad National Security, LLC, and the U.S. Department
-// of Energy/National Nuclear Security Administration. The Government is granted for
-// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
-// license in this material to reproduce, prepare derivative works, distribute copies to
-// the public, perform publicly and display publicly, and to permit others to do so.
-//========================================================================================
-//! \file tasks.cpp
-//  \brief implementation of the TaskID class
-
-#include "utils/error_checking.hpp"
-#include "tasks/task_id.hpp"
-
-#include
-#include
-#include
-#include
-#include
-
-namespace parthenon {
-
-// TaskID constructor. Default id = 0.
-TaskID::TaskID(int id) { Set(id); }
-
-void TaskID::Set(int id) {
-  if (id < 0) throw std::invalid_argument("TaskID requires integer arguments >= 0");
-  if (id == 0) {
-    bitblocks.resize(1);
-    return;
-  }
-  id--;
-  const int n_myblocks = id / BITBLOCK + 1;
-  // grow if necessary.
never shrink - if (n_myblocks > bitblocks.size()) bitblocks.resize(n_myblocks); - bitblocks[n_myblocks - 1] |= (static_cast(1) << (id % BITBLOCK)); - bit = id; - nbits_set++; -} - -void TaskID::clear() { - for (auto &bset : bitblocks) { - bset = 0; - } -} - -bool TaskID::CheckDependencies(const TaskID &rhs) const { - const int n_myblocks = bitblocks.size(); - const int n_srcblocks = rhs.bitblocks.size(); - if (n_myblocks == n_srcblocks) { - for (int i = 0; i < n_myblocks; i++) { - if ((bitblocks[i] & rhs.bitblocks[i]) != rhs.bitblocks[i]) return false; - } - } else if (n_myblocks > n_srcblocks) { - for (int i = 0; i < n_srcblocks; i++) { - if ((bitblocks[i] & rhs.bitblocks[i]) != rhs.bitblocks[i]) return false; - } - } else { - for (int i = 0; i < n_myblocks; i++) { - if ((bitblocks[i] & rhs.bitblocks[i]) != rhs.bitblocks[i]) return false; - } - for (int i = n_myblocks; i < n_srcblocks; i++) { - if (rhs.bitblocks[i] > 0) return false; - } - } - return true; -} - -void TaskID::SetFinished(const TaskID &rhs) { - const int n_myblocks = bitblocks.size(); - const int n_srcblocks = rhs.bitblocks.size(); - if (n_myblocks == n_srcblocks) { - for (int i = 0; i < n_myblocks; i++) { - bitblocks[i] ^= rhs.bitblocks[i]; - } - } else if (n_myblocks > n_srcblocks) { - for (int i = 0; i < n_srcblocks; i++) { - bitblocks[i] ^= rhs.bitblocks[i]; - } - } else { - for (int i = 0; i < n_myblocks; i++) { - bitblocks[i] ^= rhs.bitblocks[i]; - } - for (int i = n_myblocks; i < n_srcblocks; i++) { - bitblocks.push_back(rhs.bitblocks[i]); - } - } -} - -bool TaskID::operator==(const TaskID &rhs) const { - if (nbits_set != rhs.nbits_set) return false; - - const int n_myblocks = bitblocks.size(); - const int n_srcblocks = rhs.bitblocks.size(); - if (n_myblocks == n_srcblocks) { - for (int i = 0; i < n_myblocks; i++) { - if (bitblocks[i] != rhs.bitblocks[i]) return false; - } - } else if (n_myblocks > n_srcblocks) { - for (int i = 0; i < n_srcblocks; i++) { - if (bitblocks[i] != rhs.bitblocks[i]) return false; - } - for (int i = n_srcblocks; i < n_myblocks; i++) { - if (bitblocks[i] > 0) return false; - } - } else { - for (int i = 0; i < n_myblocks; i++) { - if (bitblocks[i] != rhs.bitblocks[i]) return false; - } - for (int i = n_myblocks; i < n_srcblocks; i++) { - if (rhs.bitblocks[i] > 0) return false; - } - } - return true; -} - -bool TaskID::operator!=(const TaskID &rhs) const { return !operator==(rhs); } - -TaskID TaskID::operator|(const TaskID &rhs) const { - TaskID res; - const int n_myblocks = bitblocks.size(); - const int n_srcblocks = rhs.bitblocks.size(); - res.bitblocks.resize(std::max(n_myblocks, n_srcblocks)); - if (n_myblocks == n_srcblocks) { - for (int i = 0; i < n_myblocks; i++) { - res.bitblocks[i] = bitblocks[i] | rhs.bitblocks[i]; - } - } else if (n_myblocks > n_srcblocks) { - for (int i = 0; i < n_srcblocks; i++) { - res.bitblocks[i] = bitblocks[i] | rhs.bitblocks[i]; - } - for (int i = n_srcblocks; i < n_myblocks; i++) { - res.bitblocks[i] = bitblocks[i]; - } - } else { - for (int i = 0; i < n_myblocks; i++) { - res.bitblocks[i] = bitblocks[i] | rhs.bitblocks[i]; - } - for (int i = n_myblocks; i < n_srcblocks; i++) { - res.bitblocks[i] = rhs.bitblocks[i]; - } - } - return res; -} - -std::string TaskID::to_string() const { - std::string bs; - for (int i = bitblocks.size() - 1; i >= 0; i--) { - //bs += bitblocks[i].to_string(); - } - return bs; -} - -} // namespace parthenon diff --git a/src/tasks/task_list.hpp b/src/tasks/task_list.hpp deleted file mode 100644 index 47ae93567b23..000000000000 --- 
a/src/tasks/task_list.hpp +++ /dev/null @@ -1,534 +0,0 @@ -//======================================================================================== -// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. -// -// This program was produced under U.S. Government contract 89233218CNA000001 for Los -// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC -// for the U.S. Department of Energy/National Nuclear Security Administration. All rights -// in the program are reserved by Triad National Security, LLC, and the U.S. Department -// of Energy/National Nuclear Security Administration. The Government is granted for -// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide -// license in this material to reproduce, prepare derivative works, distribute copies to -// the public, perform publicly and display publicly, and to permit others to do so. -//======================================================================================== - -#ifndef TASKS_TASK_LIST_HPP_ -#define TASKS_TASK_LIST_HPP_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "basic_types.hpp" -#include "task_id.hpp" -#include "task_types.hpp" -#include "utils/error_checking.hpp" -#include "utils/reductions.hpp" - -namespace parthenon { - -enum class TaskListStatus { running, stuck, complete, nothing_to_do }; - -class TaskList; -namespace task_list_impl { -TaskID AddTaskHelper(TaskList *, Task); -} // namespace task_list_impl - -class IterativeTasks { - public: - IterativeTasks() = default; - IterativeTasks(TaskList *tl, int key) : tl_(tl), key_(key) { - max_iterations_ = std::numeric_limits::max(); - } - - // overload to add member functions of class T to task list - // NOTE: we must capture the object pointer - template - TaskID AddTask(TaskID const &dep, TaskStatus (T::*func)(Args1...), U *obj, - Args2 &&...args) { - return this->AddTask_(TaskType::iterative, 1, dep, [=]() mutable -> TaskStatus { - return (obj->*func)(std::forward(args)...); - }); - } - - template - TaskID AddTask(TaskID const &dep, T &&func, Args &&...args) { - return AddTask_(TaskType::iterative, 1, dep, std::forward(func), - std::forward(args)...); - } - - template - TaskID SetCompletionTask(TaskID const &dep, TaskStatus (T::*func)(Args...), U *obj, - Args &&...args) { - return AddTask_(TaskType::completion_criteria, check_interval_, dep, - [=]() mutable -> TaskStatus { - return (obj->*func)(std::forward(args)...); - }); - } - - template - TaskID SetCompletionTask(TaskID const &dep, T &&func, Args &&...args) { - return AddTask_(TaskType::completion_criteria, check_interval_, dep, - std::forward(func), std::forward(args)...); - } - - void SetMaxIterations(const int max) { - assert(max > 0); - max_iterations_ = max; - } - void SetCheckInterval(const int chk) { - assert(chk > 0); - check_interval_ = chk; - } - void SetFailWithMaxIterations(const bool flag) { throw_with_max_iters_ = flag; } - void SetWarnWithMaxIterations(const bool flag) { warn_with_max_iters_ = flag; } - bool ShouldThrowWithMax() const { return throw_with_max_iters_; } - bool ShouldWarnWithMax() const { return warn_with_max_iters_; } - int GetMaxIterations() const { return max_iterations_; } - int GetIterationCount() const { return count_; } - void IncrementCount() { count_++; } - void ResetCount() { count_ = 0; } - void PrintList() { std::cout << "tl_ = " << tl_ << std::endl; } - - private: - template - 
TaskID AddTask_(const TaskType &type, const int interval, TaskID const &dep, F &&func, - Args &&...args) { - TaskID id(0); - id = task_list_impl::AddTaskHelper( - tl_, Task( - id, dep, - [=, func = std::forward(func)]() mutable -> TaskStatus { - return func(std::forward(args)...); - }, - type, key_)); - return id; - } - TaskList *tl_; - int key_; - int max_iterations_; - unsigned int count_ = 0; - int check_interval_ = 1; - bool throw_with_max_iters_ = false; - bool warn_with_max_iters_ = true; -}; - -class TaskList { - public: - TaskList() = default; - bool IsComplete() { return task_list_.empty(); } - int Size() { return task_list_.size(); } - void MarkRegional(const TaskID &id) { - for (auto &task : task_list_) { - if (task.GetID() == id) { - task.SetRegional(); - break; - } - } - } - void MarkTaskComplete(const TaskID &id) { tasks_completed_.SetFinished(id); } - bool CheckDependencies(const TaskID &id) const { - return tasks_completed_.CheckDependencies(id); - } - bool CheckTaskRan(const TaskID &id) const { - for (auto &task : task_list_) { - if (task.GetID() == id) { - return (task.GetStatus() != TaskStatus::incomplete && - task.GetStatus() != TaskStatus::skip && - task.GetStatus() != TaskStatus::waiting); - } - } - return false; - } - bool CheckStatus(const TaskID &id, TaskStatus status) const { - for (auto &task : task_list_) { - if (task.GetID() == id) return (task.GetStatus() == status); - } - return true; - } - bool CheckTaskCompletion(const TaskID &id) const { - return CheckStatus(id, TaskStatus::complete); - } - void ClearComplete() { - auto task = task_list_.begin(); - while (task != task_list_.end()) { - if (task->GetStatus() == TaskStatus::complete && - task->GetType() != TaskType::iterative && - task->GetType() != TaskType::completion_criteria && !task->IsRegional()) { - task = task_list_.erase(task); - } else { - ++task; - } - } - std::set completed_iters; - for (auto &tsk : task_list_) { - if (tsk.GetType() == TaskType::completion_criteria && - tsk.GetStatus() == TaskStatus::complete && !tsk.IsRegional()) { - completed_iters.insert(tsk.GetKey()); - } - } - for (const auto &key : completed_iters) { - ClearIteration(key); - } - } - void ClearIteration(const int key) { - auto task = task_list_.begin(); - while (task != task_list_.end()) { - if (task->GetKey() == key) { - task = task_list_.erase(task); - } else { - ++task; - } - } - iter_tasks[key].ResetCount(); - } - void ResetIteration(const int key) { - PARTHENON_REQUIRE_THROWS(key < iter_tasks.size(), "Invalid iteration key"); - iter_tasks[key].IncrementCount(); - if (iter_tasks[key].GetIterationCount() == iter_tasks[key].GetMaxIterations()) { - if (iter_tasks[key].ShouldThrowWithMax()) { - PARTHENON_THROW("Iteration " + iter_labels[key] + - " reached maximum allowed cycles without convergence."); - } - if (iter_tasks[key].ShouldWarnWithMax()) { - PARTHENON_WARN("Iteration " + iter_labels[key] + - " reached maximum allowed cycles without convergence."); - } - for (auto &task : task_list_) { - if (task.GetKey() == key && task.GetType() == TaskType::completion_criteria) { - MarkTaskComplete(task.GetID()); - } - } - ClearIteration(key); - return; - } - for (auto &task : task_list_) { - if (task.GetKey() == key) { - if (CheckDependencies(task.GetID())) { - MarkTaskComplete(task.GetID()); - } - task.SetStatus(TaskStatus::incomplete); - } - } - } - void ResetIfNeeded(const TaskID &id) { - for (auto &task : task_list_) { - if (task.GetID() == id) { - if (task.GetType() == TaskType::completion_criteria) { - 
ResetIteration(task.GetKey()); - } - break; - } - } - } - bool CompleteIfNeeded(const TaskID &id) { - MarkTaskComplete(id); - auto task = task_list_.begin(); - while (task != task_list_.end()) { - if (task->GetID() == id) { - if (task->GetType() == TaskType::completion_criteria) { - ClearIteration(task->GetKey()); - return true; - } else if (task->GetType() == TaskType::single) { - task = task_list_.erase(task); - } else { - task->SetStatus(TaskStatus::waiting); - } - break; - } else { - ++task; - } - } - return false; - } - void DoAvailable() { - auto task = task_list_.begin(); - while (task != task_list_.end()) { - // first skip task if it's complete. Possible for iterative tasks - if (task->GetStatus() != TaskStatus::incomplete) { - ++task; - continue; - } - const auto &dep = task->GetDependency(); - if (CheckDependencies(dep)) { - (*task)(); - if (task->GetStatus() == TaskStatus::complete && !task->IsRegional()) { - MarkTaskComplete(task->GetID()); - } else if (task->GetStatus() == TaskStatus::skip && - task->GetType() == TaskType::completion_criteria) { - ResetIteration(task->GetKey()); - } else if (task->GetStatus() == TaskStatus::iterate && !task->IsRegional()) { - ResetIteration(task->GetKey()); - } - } - ++task; - } - ClearComplete(); - } - bool Validate() const { - std::set iters; - for (auto &task : task_list_) { - if (task.GetType() == TaskType::iterative) iters.insert(task.GetKey()); - } - int num_iters = iters.size(); - int found = 0; - for (auto &iter : iters) { - for (auto &task : task_list_) { - if (task.GetType() == TaskType::completion_criteria && task.GetKey() == iter) { - found++; - break; - } - } - } - bool valid = (found == num_iters); - PARTHENON_REQUIRE_THROWS( - valid, - "Task list validation found iterative tasks without a completion criteria"); - return valid; - } - - TaskID AddTask(Task &tsk) { - TaskID id(tasks_added_ + 1); - tsk.SetID(id); - task_list_.push_back(std::move(tsk)); - tasks_added_++; - return id; - } - - // overload to add member functions of class T to task list - // NOTE: we must capture the object pointer - template - TaskID AddTask(TaskID const &dep, TaskStatus (T::*func)(Args1...), U *obj, - Args2 &&...args) { - return this->AddTask(dep, [=]() mutable -> TaskStatus { - return (obj->*func)(std::forward(args)...); - }); - } - - template - TaskID AddTask(TaskID const &dep, F &&func, Args &&...args) { - TaskID id(tasks_added_ + 1); - task_list_.push_back( - Task(id, dep, [=, func = std::forward(func)]() mutable -> TaskStatus { - return func(std::forward(args)...); - })); - tasks_added_++; - return id; - } - - IterativeTasks &AddIteration(const std::string &label) { - int key = iter_tasks.size(); - iter_tasks[key] = IterativeTasks(this, key); - iter_labels[key] = label; - return iter_tasks[key]; - } - - void Print() { - int i = 0; - std::cout << "TaskList::Print():" << std::endl; - for (auto &t : task_list_) { - std::cout << " " << i << " " << t.GetID().to_string() << " " - << t.GetDependency().to_string() << " " << tasks_completed_.to_string() - << " " << (t.GetStatus() == TaskStatus::incomplete) - << (t.GetStatus() == TaskStatus::complete) - << (t.GetStatus() == TaskStatus::skip) - << (t.GetStatus() == TaskStatus::iterate) - << (t.GetStatus() == TaskStatus::fail) << std::endl; - - i++; - } - } - - protected: - std::map iter_tasks; - std::map iter_labels; - std::list task_list_; - int tasks_added_ = 0; - TaskID tasks_completed_; -}; - -namespace task_list_impl { -// helper function to avoid having to call a member function of TaskList from -// 
IterativeTasks before TaskList has been defined -inline TaskID AddTaskHelper(TaskList *tl, Task tsk) { return tl->AddTask(tsk); } -} // namespace task_list_impl - -class RegionCounter { - public: - explicit RegionCounter(const std::string &base) : base_(base), cnt_(0) {} - std::string ID() { return base_ + std::to_string(cnt_++); } - - private: - const std::string base_; - int cnt_; -}; - -class TaskRegion { - public: - explicit TaskRegion(const int size) : lists(size) {} - void AddRegionalDependencies(const int reg_dep_id, const int list_index, - const TaskID &id) { - AddRegionalDependencies(std::to_string(reg_dep_id), list_index, id); - } - void AddRegionalDependencies(const std::string ®_dep_id, const int list_index, - const TaskID &id) { - AddDependencies(reg_dep_id, list_index, id); - global[reg_dep_id] = false; - } - void AddGlobalDependencies(const int reg_dep_id, const int list_index, - const TaskID &id) { - AddGlobalDependencies(std::to_string(reg_dep_id), list_index, id); - } - void AddGlobalDependencies(const std::string ®_dep_id, const int list_index, - const TaskID &id) { - AddDependencies(reg_dep_id, list_index, id); - global[reg_dep_id] = true; - } - - TaskList &operator[](int i) { return lists[i]; } - - int size() const { return lists.size(); } - - bool Execute() { - for (auto i = 0; i < lists.size(); ++i) { - if (!lists[i].IsComplete()) { - lists[i].DoAvailable(); - } - } - return CheckAndUpdate(); - } - - bool CheckAndUpdate() { - auto it = id_for_reg.begin(); - while (it != id_for_reg.end()) { - auto ®_id = it->first; - bool check = false; - if (HasRun(reg_id) && !all_done[reg_id].active) { - all_done[reg_id].val = IsComplete(reg_id); - if (global[reg_id]) { - all_done[reg_id].StartReduce(MPI_MIN); - } else { - check = true; - } - } - if (global[reg_id] && all_done[reg_id].active) { - auto status = all_done[reg_id].CheckReduce(); - if (status == TaskStatus::complete) { - check = true; - } - } - if (check) { - if (all_done[reg_id].val) { - bool clear = false; - for (auto &lst : it->second) { - clear = lists[lst.first].CompleteIfNeeded(lst.second); - } - if (clear) { - all_done.erase(reg_id); - global.erase(reg_id); - it = id_for_reg.erase(it); - } else { - ++it; - } - } else { - for (auto &lst : it->second) { - lists[lst.first].ResetIfNeeded(lst.second); - } - all_done[reg_id].val = 0; - ++it; - } - } else { - ++it; - } - } - int complete_cnt = 0; - const int num_lists = size(); - for (auto i = 0; i < num_lists; ++i) { - if (lists[i].IsComplete()) complete_cnt++; - } - return (complete_cnt == num_lists); - } - - bool Validate() const { - for (auto &list : lists) { - if (!list.Validate()) return false; - } - return true; - } - - private: - void AddDependencies(const std::string &label, const int list_id, const TaskID &tid) { - id_for_reg[label][list_id] = tid; - lists[list_id].MarkRegional(tid); - all_done[label].val = 0; - } - bool HasRun(const std::string ®_id) { - auto &lvec = id_for_reg[reg_id]; - int n_to_run = lvec.size(); - int n_ran = 0; - for (auto &[list_index, id] : lvec) { - if (lists[list_index].CheckTaskRan(id)) { - n_ran++; - } - } - return n_ran == n_to_run; - } - bool IsComplete(const std::string ®_id) { - auto &lvec = id_for_reg[reg_id]; - int n_to_finish = lvec.size(); - int n_finished = 0; - for (auto &[list_index, id] : lvec) { - if (lists[list_index].CheckTaskCompletion(id)) { - n_finished++; - } - } - return n_finished == n_to_finish; - } - - std::unordered_map> id_for_reg; - std::vector lists; - std::unordered_map> all_done; - std::unordered_map 
global; -}; - -class TaskCollection { - public: - TaskCollection() = default; - TaskRegion &AddRegion(const int num_lists) { - regions.push_back(TaskRegion(num_lists)); - return regions.back(); - } - TaskListStatus Execute() { - assert(Validate()); - for (auto ®ion : regions) { - bool complete = false; - while (!complete) { - complete = region.Execute(); - } - } - return TaskListStatus::complete; - } - - private: - bool Validate() const { - for (auto ®ion : regions) { - if (!region.Validate()) return false; - } - return true; - } - - std::vector regions; -}; - -} // namespace parthenon - -#endif // TASKS_TASK_LIST_HPP_ diff --git a/src/tasks/task_types.hpp b/src/tasks/task_types.hpp deleted file mode 100644 index 414169450980..000000000000 --- a/src/tasks/task_types.hpp +++ /dev/null @@ -1,102 +0,0 @@ -//======================================================================================== -// (C) (or copyright) 2021. Triad National Security, LLC. All rights reserved. -// -// This program was produced under U.S. Government contract 89233218CNA000001 for Los -// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC -// for the U.S. Department of Energy/National Nuclear Security Administration. All rights -// in the program are reserved by Triad National Security, LLC, and the U.S. Department -// of Energy/National Nuclear Security Administration. The Government is granted for -// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide -// license in this material to reproduce, prepare derivative works, distribute copies to -// the public, perform publicly and display publicly, and to permit others to do so. -//======================================================================================== - -#ifndef TASKS_TASK_TYPES_HPP_ -#define TASKS_TASK_TYPES_HPP_ - -#include // NOLINT [build/c++11] -#include -#include -#include -#include - -#include "basic_types.hpp" -#include "globals.hpp" - -namespace parthenon { - -enum class TaskType { single, iterative, completion_criteria }; - -class Task { - public: - Task(const TaskID &id, const TaskID &dep, std::function func) - : myid_(id), dep_(dep), type_(TaskType::single), key_(-1), func_(std::move(func)), - interval_(1) {} - Task(const TaskID &id, const TaskID &dep, std::function func, - const TaskType &type, const int key) - : myid_(id), dep_(dep), type_(type), key_(key), func_(std::move(func)), - interval_(1) { - assert(key_ >= 0); - assert(type_ != TaskType::single); - } - Task(const TaskID &id, const TaskID &dep, std::function func, - const TaskType &type, const int key, const int interval) - : myid_(id), dep_(dep), type_(type), key_(key), func_(std::move(func)), - interval_(interval) { - assert(key_ >= 0); - assert(type_ != TaskType::single); - assert(interval_ > 0); - } - void operator()() { - if (calls_ == 0) { - // on first call, set start time - start_time_ = std::chrono::high_resolution_clock::now(); - } - - calls_++; - if (calls_ % interval_ == 0) { - // set total runtime of current task, must go into Global namespace because - // functions called by the task functor don't have access to the task itself and - // they may want to check if the task has been running for too long indicating that - // it got stuck in an infinite loop - Globals::current_task_runtime_sec = - std::chrono::duration_cast( - std::chrono::high_resolution_clock::now() - start_time_) - .count() * - 1e-9; - status_ = func_(); - Globals::current_task_runtime_sec = 0.0; - } else { - status_ = 
TaskStatus::skip; - } - } - void SetID(const TaskID &id) { myid_ = id; } - const TaskID &GetID() const { return myid_; } - const TaskID &GetDependency() const { return dep_; } - TaskStatus GetStatus() const { return status_; } - void SetStatus(const TaskStatus &status) { status_ = status; } - TaskType GetType() const { return type_; } - int GetKey() const { return key_; } - void SetRegional() { regional_ = true; } - bool IsRegional() const { return regional_; } - - private: - TaskID myid_; - const TaskID dep_; - const TaskType type_; - const int key_; - TaskStatus status_ = TaskStatus::incomplete; - bool regional_ = false; - bool lb_time_ = false; - std::function func_; - int calls_ = 0; - const int interval_; - - // this is used to record the start time of the task so that we can check for how long - // the task been running and detect potential hangs, infinite loops, etc. - std::chrono::high_resolution_clock::time_point start_time_; -}; - -} // namespace parthenon - -#endif // TASKS_TASK_TYPES_HPP_ diff --git a/src/tasks/tasks.hpp b/src/tasks/tasks.hpp new file mode 100644 index 000000000000..c0960787a53e --- /dev/null +++ b/src/tasks/tasks.hpp @@ -0,0 +1,500 @@ +//======================================================================================== +// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== +#ifndef TASKS_TASKS_HPP_ +#define TASKS_TASKS_HPP_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "thread_pool.hpp" +#include "utils/error_checking.hpp" + +namespace parthenon { + +enum class TaskListStatus { complete }; // doesn't feel like we need this... 
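+
+// A minimal usage sketch of the interface below (illustrative only: DoWork,
+// CheckConvergence, and data are hypothetical user-provided callables/state,
+// with the callables returning TaskStatus):
+//
+//   TaskCollection tc;
+//   TaskRegion &region = tc.AddRegion(1);
+//   TaskList &tl = region[0];
+//   auto work = tl.AddTask(TaskID(), DoWork, data);
+//   tl.AddTask(TaskQualifier::local_sync | TaskQualifier::completion, work,
+//              CheckConvergence, data);
+//   tc.Execute();
+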
+enum class TaskType { normal, completion };
+
+class TaskQualifier {
+ public:
+  using qualifier_t = uint64_t;
+  TaskQualifier() = delete;
+  TaskQualifier(const qualifier_t n) : flags(n) {} // NOLINT(runtime/explicit)
+
+  static inline constexpr qualifier_t normal{0};
+  static inline constexpr qualifier_t local_sync{1 << 0};
+  static inline constexpr qualifier_t global_sync{1 << 1};
+  static inline constexpr qualifier_t completion{1 << 2};
+  static inline constexpr qualifier_t once_per_region{1 << 3};
+
+  bool LocalSync() const { return flags & local_sync; }
+  bool GlobalSync() const { return flags & global_sync; }
+  bool Completion() const { return flags & completion; }
+  bool Once() const { return flags & once_per_region; }
+
+ private:
+  qualifier_t flags;
+};
+
+// forward declare Task for TaskID
+class Task;
+class TaskID {
+ public:
+  TaskID() : task(nullptr) {}
+  // pointers to Task are implicitly convertible to TaskID
+  TaskID(Task *t) : task(t) {} // NOLINT(runtime/explicit)
+
+  TaskID operator|(const TaskID &other) const {
+    // calling this operator means you're building a TaskID to hold a dependency
+    TaskID result;
+    if (task != nullptr)
+      result.dep.push_back(task);
+    else
+      result.dep.insert(result.dep.end(), dep.begin(), dep.end());
+    if (other.task != nullptr)
+      result.dep.push_back(other.task);
+    else
+      result.dep.insert(result.dep.end(), other.dep.begin(), other.dep.end());
+    return result;
+  }
+
+  const std::vector<Task *> &GetIDs() const { return std::cref(dep); }
+
+  bool empty() const { return (!task && dep.size() == 0); }
+  Task *GetTask() { return task; }
+
+ private:
+  Task *task = nullptr;
+  std::vector<Task *> dep;
+};
+
+class Task {
+ public:
+  Task() = default;
+  template <class TID>
+  Task(TID &&dep, const std::function<TaskStatus()> &func,
+       std::pair<int, int> limits = {1, 1})
+      : f(func), exec_limits(limits) {
+    if (dep.GetIDs().size() == 0 && dep.GetTask()) {
+      dependencies.insert(dep.GetTask());
+    } else {
+      for (auto &d : dep.GetIDs()) {
+        dependencies.insert(d);
+      }
+    }
+    // always add "this" to repeat task if it's incomplete
+    dependent[static_cast<int>(TaskStatus::incomplete)].push_back(this);
+  }
+
+  TaskStatus operator()() {
+    auto status = f();
+    if (task_type == TaskType::completion) {
+      // keep track of how many times it's been called
+      num_calls += (status == TaskStatus::iterate || status == TaskStatus::complete);
+      // enforce minimum number of iterations
+      if (num_calls < exec_limits.first && status == TaskStatus::complete)
+        status = TaskStatus::iterate;
+      // enforce maximum number of iterations
+      if (num_calls == exec_limits.second) status = TaskStatus::complete;
+    }
+    // save the status in the Task object
+    SetStatus(status);
+    return status;
+  }
+  TaskID GetID() { return this; }
+  bool ready() {
+    // check that no dependency is incomplete
+    bool go = true;
+    for (auto &dep : dependencies) {
+      go = go && (dep->GetStatus() != TaskStatus::incomplete);
+    }
+    return go;
+  }
+  void AddDependency(Task *t) { dependencies.insert(t); }
+  std::unordered_set<Task *> &GetDependencies() { return dependencies; }
+  void AddDependent(Task *t, TaskStatus status) {
+    dependent[static_cast<int>(status)].push_back(t);
+  }
+  std::vector<Task *> &GetDependent(TaskStatus status = TaskStatus::complete) {
+    return dependent[static_cast<int>(status)];
+  }
+  void SetType(TaskType type) { task_type = type; }
+  TaskType GetType() { return task_type; }
+  void SetStatus(TaskStatus status) {
+    std::lock_guard<std::mutex> lock(mutex);
+    task_status = status;
+  }
+  TaskStatus GetStatus() {
+    std::lock_guard<std::mutex> lock(mutex);
+    return task_status;
+  }
+  void reset_iteration() { num_calls = 0; }
+
+ private:
+  std::function<TaskStatus()> f;
+  // store a list of tasks that might be available to
+  // run for each possible status this task returns
+  std::array<std::vector<Task *>, 3> dependent;
+  std::unordered_set<Task *> dependencies;
+  std::pair<int, int> exec_limits;
+  TaskType task_type = TaskType::normal;
+  int num_calls = 0;
+  TaskStatus task_status = TaskStatus::incomplete;
+  std::mutex mutex;
+};
+
+class TaskRegion;
+class TaskList {
+  friend class TaskRegion;
+
+ public:
+  TaskList() : TaskList(TaskID(), {1, 1}) {}
+  explicit TaskList(const TaskID &dep, std::pair<int, int> limits)
+      : dependency(dep), exec_limits(limits) {
+    // make a trivial first_task after which others will get launched
+    // simplifies logic for iteration and startup
+    tasks.push_back(std::make_shared<Task>(
+        dependency,
+        [&tasks = tasks]() {
+          for (auto &t : tasks) {
+            t->SetStatus(TaskStatus::incomplete);
+          }
+          return TaskStatus::complete;
+        },
+        exec_limits));
+    first_task = tasks.back().get();
+    // connect list dependencies to this list's first_task
+    for (auto t : first_task->GetDependencies()) {
+      t->AddDependent(first_task, TaskStatus::complete);
+    }
+
+    // make a trivial last_task that tasks dependent on this list's execution
+    // can depend on. Also simplifies exiting completed iterations
+    tasks.push_back(std::make_shared<Task>(
+        TaskID(),
+        [&completion_tasks = completion_tasks]() {
+          for (auto t : completion_tasks) {
+            t->reset_iteration();
+          }
+          return TaskStatus::complete;
+        },
+        exec_limits));
+    last_task = tasks.back().get();
+  }
+
+  template <class... Args>
+  TaskID AddTask(TaskID dep, Args &&...args) {
+    return AddTask(TaskQualifier::normal, dep, std::forward<Args>(args)...);
+  }
+
+  template <class... Args>
+  TaskID AddTask(const TaskQualifier tq, TaskID dep, Args &&...args) {
+    // user-space tasks always depend on something. If no dependencies are given,
+    // make the task dependent on the list's first_task
+    if (dep.empty()) dep = TaskID(first_task);
+
+    if (!tq.Once() || (tq.Once() && unique_id == 0)) {
+      AddUserTask(dep, std::forward<Args>(args)...);
+    } else {
+      tasks.push_back(std::make_shared<Task>(
+          dep, [=]() { return TaskStatus::complete; }, exec_limits));
+    }
+
+    Task *my_task = tasks.back().get();
+    TaskID id(my_task);
+
+    if (tq.LocalSync() || tq.GlobalSync() || tq.Once()) {
+      regional_tasks.push_back(my_task);
+    }
+
+    if (tq.GlobalSync()) {
+      bool do_mpi = false;
+#ifdef MPI_PARALLEL
+      // make status, request, and comm for this global task
+      global_status.push_back(std::make_shared<int>(0));
+      global_request.push_back(std::make_shared<MPI_Request>(MPI_REQUEST_NULL));
+      // be careful about the custom deleter so it doesn't call
+      // an MPI function after Finalize
+      global_comm.emplace_back(new MPI_Comm, [&](MPI_Comm *d) {
+        int finalized;
+        PARTHENON_MPI_CHECK(MPI_Finalized(&finalized));
+        if (!finalized) PARTHENON_MPI_CHECK(MPI_Comm_free(d));
+      });
+      // we need another communicator to support multiple in-flight non-blocking
+      // collectives where we can't guarantee calling order across ranks
+      PARTHENON_MPI_CHECK(MPI_Comm_dup(MPI_COMM_WORLD, global_comm.back().get()));
+      do_mpi = true;
+#endif // MPI_PARALLEL
+      TaskID start;
+      // only call MPI once per region, on the list with unique_id = 0
+      if (unique_id == 0 && do_mpi) {
+#ifdef MPI_PARALLEL
+        // add a task that starts the Iallreduce on the task statuses
+        tasks.push_back(std::make_shared<Task>(
+            id,
+            [my_task, &stat = *global_status.back(), &req = *global_request.back(),
+             &comm = *global_comm.back()]() {
+              // jump through a couple hoops to figure out statuses of all instances of
+              // my_task across all lists in the enclosing TaskRegion
+              auto dependent = my_task->GetDependent(TaskStatus::complete);
+              assert(dependent.size() == 1);
+              auto mytask = *dependent.begin();
+              stat = 0;
+              for (auto dep : mytask->GetDependencies()) {
+                stat = std::max(stat, static_cast<int>(dep->GetStatus()));
+              }
+              PARTHENON_MPI_CHECK(
+                  MPI_Iallreduce(MPI_IN_PLACE, &stat, 1, MPI_INT, MPI_MAX, comm, &req));
+              return TaskStatus::complete;
+            },
+            exec_limits));
+        start = TaskID(tasks.back().get());
+        // add a task that tests for completion of the Iallreduces of statuses
+        tasks.push_back(std::make_shared<Task>(
+            start,
+            [&stat = *global_status.back(), &req = *global_request.back()]() {
+              int check;
+              PARTHENON_MPI_CHECK(MPI_Test(&req, &check, MPI_STATUS_IGNORE));
+              if (check) {
+                return static_cast<TaskStatus>(stat);
+              }
+              return TaskStatus::incomplete;
+            },
+            exec_limits));
+#endif // MPI_PARALLEL
+      } else { // unique_id != 0
+        // just add empty tasks
+        tasks.push_back(std::make_shared<Task>(
+            id, [&]() { return TaskStatus::complete; }, exec_limits));
+        start = TaskID(tasks.back().get());
+        tasks.push_back(std::make_shared<Task>(
+            start, [my_task]() { return my_task->GetStatus(); }, exec_limits));
+      }
+      // reset id so it now points at the task that finishes the Iallreduce
+      id = TaskID(tasks.back().get());
+      // make the task that starts the Iallreduce point at the one that finishes it
+      start.GetTask()->AddDependent(id.GetTask(), TaskStatus::complete);
+      // for any status != incomplete, my_task should point at the mpi reduction
+      my_task->AddDependent(start.GetTask(), TaskStatus::complete);
+      my_task->AddDependent(start.GetTask(), TaskStatus::iterate);
+      // make the finish Iallreduce task finish on all lists before moving on
+      regional_tasks.push_back(id.GetTask());
+    }
+
+    // connect completion tasks to last_task
+    if (tq.Completion()) {
+      auto t = id.GetTask();
+      t->SetType(TaskType::completion);
+      t->AddDependent(last_task, TaskStatus::complete);
+      completion_tasks.push_back(t);
+    }
+
+    // make connections so tasks point to this task to run next
+    for (auto d : my_task->GetDependencies()) {
+      if (d->GetType() == TaskType::completion) {
+        d->AddDependent(my_task, TaskStatus::iterate);
+      } else {
+        d->AddDependent(my_task, TaskStatus::complete);
+      }
+    }
+    return id;
+  }
+
+  template <class TID>
+  std::pair<TaskList &, TaskID> AddSublist(TID &&dep, std::pair<int, int> minmax_iters) {
+    sublists.push_back(std::make_shared<TaskList>(dep, minmax_iters));
+    auto &tl = *sublists.back();
+    tl.SetID(unique_id);
+    return std::make_pair(std::ref(tl), TaskID(tl.last_task));
+  }
+
+ private:
+  TaskID dependency;
+  std::pair<int, int> exec_limits;
+  // put these in shared_ptrs so copying TaskList works as expected
+  std::vector<std::shared_ptr<Task>> tasks;
+  std::vector<std::shared_ptr<TaskList>> sublists;
+#ifdef MPI_PARALLEL
+  std::vector<std::shared_ptr<int>> global_status;
+  std::vector<std::shared_ptr<MPI_Request>> global_request;
+  std::vector<std::shared_ptr<MPI_Comm>> global_comm;
+#endif // MPI_PARALLEL
+  // vectors are fine for these
+  std::vector<Task *> regional_tasks;
+  std::vector<Task *> global_tasks;
+  std::vector<Task *> completion_tasks;
+  // special startup and takedown tasks auto added to lists
+  Task *first_task;
+  Task *last_task;
+  // a unique id to support tasks that should only get executed once per region
+  int unique_id;
+
+  Task *GetStartupTask() { return first_task; }
+  size_t NumRegional() const { return regional_tasks.size(); }
+  Task *Regional(const int i) { return regional_tasks[i]; }
+  void SetID(const int id) { unique_id = id; }
+
+  void ConnectIteration() {
+    if (completion_tasks.size() != 0) {
+      auto last = completion_tasks.back();
+      last->AddDependent(first_task, TaskStatus::iterate);
+    }
+    for (auto &tl : sublists)
+      tl->ConnectIteration();
+  }
+
+  template <class T, class U, class... Args1, class... Args2>
+  void AddUserTask(TaskID &dep, TaskStatus (T::*func)(Args1...), U *obj,
+                   Args2 &&...args) {
+    tasks.push_back(std::make_shared<Task>(
+        dep,
+        [=]() mutable -> TaskStatus {
+          return (obj->*func)(std::forward<Args2>(args)...);
+        },
+        exec_limits));
+  }
+
+  template <class F, class... Args>
+  void AddUserTask(TaskID &dep, F &&func, Args &&...args) {
+    tasks.push_back(std::make_shared<Task>(
+        dep,
+        [=, func = std::forward<F>(func)]() mutable -> TaskStatus {
+          return func(std::forward<Args>(args)...);
+        },
+        exec_limits));
+  }
+};
+
+class TaskRegion {
+ public:
+  TaskRegion() = delete;
+  explicit TaskRegion(const int num_lists) : task_lists(num_lists) {
+    for (int i = 0; i < num_lists; i++)
+      task_lists[i].SetID(i);
+  }
+
+  TaskListStatus Execute(ThreadPool &pool) {
+    // for now, require a pool with one thread
+    PARTHENON_REQUIRE_THROWS(pool.size() == 1,
+                             "ThreadPool size != 1 is not currently supported.");
+
+    // first, if needed, finish building the graph
+    if (!graph_built) BuildGraph();
+
+    // declare this so it can call itself
+    std::function<TaskStatus(Task *)> ProcessTask;
+    ProcessTask = [&pool, &ProcessTask](Task *task) -> TaskStatus {
+      auto status = task->operator()();
+      auto next_up = task->GetDependent(status);
+      for (auto t : next_up) {
+        if (t->ready()) {
+          pool.enqueue([t, &ProcessTask]() { return ProcessTask(t); });
+        }
+      }
+      return status;
+    };
+
+    // now enqueue the "first_task" for all task lists
+    for (auto &tl : task_lists) {
+      auto t = tl.GetStartupTask();
+      pool.enqueue([t, &ProcessTask]() { return ProcessTask(t); });
+    }
+
+    // then wait until everything is done
+    pool.wait();
+
+    return TaskListStatus::complete;
+  }
+
+  TaskList &operator[](const int i) { return task_lists[i]; }
+
+  size_t size() const { return task_lists.size(); }
+
+ private:
+  std::vector<TaskList> task_lists;
+  bool graph_built = false;
+
+  void BuildGraph() {
+    // first handle regional dependencies
+    const auto num_lists = task_lists.size();
+    const auto num_regional = task_lists.front().NumRegional();
+    std::vector<Task *> tasks(num_lists);
+    for (int i = 0; i < num_regional; i++) {
+      for (int j = 0; j < num_lists; j++) {
+        tasks[j] = task_lists[j].Regional(i);
+      }
+      std::vector<std::vector<Task *>> reg_dep;
+      for (int j = 0; j < num_lists; j++) {
+        reg_dep.push_back(std::vector<Task *>());
+        for (auto t : tasks[j]->GetDependent(TaskStatus::complete)) {
+          reg_dep[j].push_back(t);
+        }
+      }
+      for (int j = 0; j < num_lists; j++) {
+        for (auto t : reg_dep[j]) {
+          for (int k = 0; k < num_lists; k++) {
+            if (j == k) continue;
+            t->AddDependency(tasks[k]);
+            tasks[k]->AddDependent(t, TaskStatus::complete);
+          }
+        }
+      }
+    }
+
+    // now hook up iterations
+    for (auto &tl : task_lists) {
+      tl.ConnectIteration();
+    }
+
+    graph_built = true;
+  }
+};
+
+class TaskCollection {
+ public:
+  TaskCollection() = default;
+
+  TaskRegion &AddRegion(const int num_lists) {
+    regions.emplace_back(num_lists);
+    return regions.back();
+  }
+  TaskListStatus Execute() {
+    ThreadPool pool(1);
+    return Execute(pool);
+  }
+  TaskListStatus Execute(ThreadPool &pool) {
+    TaskListStatus status;
+    for (auto &region : regions) {
+      status = region.Execute(pool);
+      if (status != TaskListStatus::complete) return status;
+    }
+    return TaskListStatus::complete;
+  }
+
+ private:
+  std::list<TaskRegion> regions;
+};
+
+} // namespace parthenon
+
+#endif // TASKS_TASKS_HPP_
diff --git a/src/tasks/thread_pool.hpp b/src/tasks/thread_pool.hpp
new file mode 100644
index 000000000000..b8f526750230
--- /dev/null
+++ b/src/tasks/thread_pool.hpp
@@ -0,0 +1,139 @@
+//========================================================================================
+// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//========================================================================================
+
+#ifndef TASKS_THREAD_POOL_HPP_
+#define TASKS_THREAD_POOL_HPP_
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <utility>
+#include <vector>
+
+namespace parthenon {
+
+template <typename T>
+class ThreadQueue {
+ public:
+  explicit ThreadQueue(const int num_workers) : nworkers(num_workers), nwaiting(0) {}
+  void push(T q) {
+    std::lock_guard<std::mutex> lock(mutex);
+    queue.push(q);
+    cv.notify_one();
+  }
+  bool pop(T &q) {
+    std::unique_lock<std::mutex> lock(mutex);
+    if (queue.empty()) {
+      nwaiting++;
+      if (waiting && nwaiting == nworkers) {
+        complete = true;
+        complete_cv.notify_all();
+      }
+      cv.wait(lock, [this]() { return exit || !queue.empty(); });
+      nwaiting--;
+      if (exit) return true;
+    }
+    q = queue.front();
+    queue.pop();
+    return false;
+  }
+  void signal_kill() {
+    std::lock_guard<std::mutex> lock(mutex);
+    std::queue<T>().swap(queue);
+    complete = true;
+    exit = true;
+    cv.notify_all();
+  }
+  void signal_exit_when_finished() {
+    std::lock_guard<std::mutex> lock(mutex);
+    exit = true;
+    complete = true;
+    cv.notify_all();
+  }
+  void wait_for_complete() {
+    std::unique_lock<std::mutex> lock(mutex);
+    waiting = true;
+    if (queue.empty() && nwaiting == nworkers) {
+      complete = false;
+      waiting = false;
+      return;
+    }
+    complete_cv.wait(lock, [this]() { return complete; });
+    complete = false;
+    waiting = false;
+  }
+
+ private:
+  const int nworkers;
+  int nwaiting;
+  std::queue<T> queue;
+  std::mutex mutex;
+  std::condition_variable cv;
+  std::condition_variable complete_cv;
+  bool complete = false;
+  bool exit = false;
+  bool waiting = false;
+};
+
+class ThreadPool {
+ public:
+  explicit ThreadPool(const int numthreads = std::thread::hardware_concurrency())
+      : nthreads(numthreads), queue(nthreads) {
+    for (int i = 0; i < nthreads; i++) {
+      auto worker = [&]() {
+        while (true) {
+          std::function<void()> f;
+          auto stop = queue.pop(f);
+          if (stop) break;
+          if (f) f();
+        }
+      };
+      threads.emplace_back(worker);
+    }
+  }
+  ~ThreadPool() {
+    queue.signal_exit_when_finished();
+    for (auto &t : threads) {
+      t.join();
+    }
+  }
+
+  void wait() { queue.wait_for_complete(); }
+
+  void kill() { queue.signal_kill(); }
+
+  template <typename F, class... Args>
+  std::future<typename std::result_of<F(Args...)>::type> enqueue(F &&f, Args &&...args) {
+    using return_t = typename std::result_of<F(Args...)>::type;
+    auto task = std::make_shared<std::packaged_task<return_t()>>(
+        [=, func = std::forward<F>(f)] { return func(std::forward<Args>(args)...); });
+    std::future<return_t> result = task->get_future();
+    queue.push([task]() { (*task)(); });
+    return result;
+  }
+
+  int size() const { return nthreads; }
+
+ private:
+  const int nthreads;
+  std::vector<std::thread> threads;
+  ThreadQueue<std::function<void()>> queue;
+};
+
+} // namespace parthenon
+
+#endif // TASKS_THREAD_POOL_HPP_
diff --git a/src/utils/buffer_utils.cpp b/src/utils/buffer_utils.cpp
index 65213d9421c7..d2a2e418c83f 100644
--- a/src/utils/buffer_utils.cpp
+++ b/src/utils/buffer_utils.cpp
@@ -44,7 +44,7 @@ void PackData(ParArray4D<T> &src, BufArray1D<T> &buf, int sn, int en, int si, in
   const int nn = en + 1 - sn;
 
   pmb->par_for(
-      "PackData 4D", sn, en, sk, ek, sj, ej, si, ei,
+      PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn)))) =
             src(n, k, j, i);
@@ -62,7 +62,7 @@ void PackZero(BufArray1D<T> &buf, int sn, int en, int si, int ei, int sj, int ej
   const int nn = en + 1 - sn;
 
   pmb->par_for(
-      "PackZero 4D", sn, en, sk, ek, sj, ej, si, ei,
+      PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei,
       KOKKOS_LAMBDA(const int n, const int k, const int j, const int i) {
         buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn)))) = 0.0;
       });
@@ -84,7 +84,7 @@ void PackData(ParArray3D<T> &src, BufArray1D<T> &buf, int si, int ei, int sj, in
   const int nk = ek + 1 - sk;
 
   pmb->par_for(
-      "PackData 3D", sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
+      PARTHENON_AUTO_LABEL, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
         buf(offset + i - si + ni * (j - sj + nj * (k - sk))) = src(k, j, i);
       });
@@ -107,7 +107,7 @@ void UnpackData(BufArray1D<T> &buf, ParArray4D<T> &dst, int sn, int en, int si,
   const int nn = en + 1 - sn;
 
   pmb->par_for(
-      "UnpackData 4D", sn, en, sk, ek, sj, ej, si, ei,
+      PARTHENON_AUTO_LABEL, sn, en, sk, ek, sj, ej, si, ei,
       KOKKOS_LAMBDA(int n, int k, int j, int i) {
         dst(n, k, j, i) =
             buf(offset + i - si + ni * (j - sj + nj * (k - sk + nk * (n - sn))));
@@ -131,7 +131,7 @@ void UnpackData(BufArray1D<T> &buf, ParArray3D<T> &dst, int si, int ei, int sj,
   const int nk = ek + 1 - sk;
 
   pmb->par_for(
-      "UnpackData 3D", sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
+      PARTHENON_AUTO_LABEL, sk, ek, sj, ej, si, ei, KOKKOS_LAMBDA(int k, int j, int i) {
         dst(k, j, i) = buf(offset + i - si + ni * (j - sj + nj * (k - sk)));
       });
diff --git a/src/utils/index_split.cpp b/src/utils/index_split.cpp
new file mode 100644
index 000000000000..d4f2abe71596
--- /dev/null
+++ b/src/utils/index_split.cpp
@@ -0,0 +1,128 @@
+//========================================================================================
+// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//========================================================================================
+
+#include <algorithm>
+
+#include <Kokkos_Core.hpp>
+
+#include "utils/index_split.hpp"
+
+#include "basic_types.hpp"
+#include "defs.hpp"
+#include "globals.hpp"
+#include "interface/mesh_data.hpp"
+#include "kokkos_abstraction.hpp"
+#include "mesh/domain.hpp"
+#include "mesh/mesh.hpp"
+
+namespace parthenon {
+
+struct DummyFunctor {
+  DummyFunctor() = default;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(team_mbr_t team_member) const {}
+};
+
+IndexSplit::IndexSplit(MeshData<Real> *md, const IndexRange &kb, const IndexRange &jb,
+                       const IndexRange &ib, const int nkp, const int njp)
+    : nghost_(Globals::nghost), nkp_(nkp), njp_(njp), kbs_(kb.s), jbs_(jb.s), ibs_(ib.s),
+      ibe_(ib.e) {
+  Init(md, kb.e, jb.e);
+  ndim_ = md->GetNDim();
+}
+
+IndexSplit::IndexSplit(MeshData<Real> *md, IndexDomain domain, const int nkp,
+                       const int njp)
+    : nghost_(Globals::nghost), nkp_(nkp), njp_(njp) {
+  auto ib = md->GetBoundsI(domain);
+  auto jb = md->GetBoundsJ(domain);
+  auto kb = md->GetBoundsK(domain);
+  kbs_ = kb.s;
+  jbs_ = jb.s;
+  ibs_ = ib.s;
+  ibe_ = ib.e;
+  Init(md, kb.e, jb.e);
+  ndim_ = md->GetNDim();
+}
+
+void IndexSplit::Init(MeshData<Real> *md, const int kbe, const int jbe) {
+  const int total_k = kbe - kbs_ + 1;
+  const int total_j = jbe - jbs_ + 1;
+  const int total_i = ibe_ - ibs_ + 1;
+
+  // Compute max parallelism (at outer loop level) from Kokkos
+  // equivalent to NSMS in Kokkos
+  // TODO(JMM): I'm not sure if this is really the best way to do
+  // this. Based on discussion on Kokkos slack.
+#ifdef KOKKOS_ENABLE_CUDA
+  const auto space = DevExecSpace();
+  team_policy policy(space, (md->NumBlocks()) * total_k, Kokkos::AUTO);
+  // JMM: In principle, should pass a realistic functor here. Using a
+  // dummy because we don't know what's available.
+  // TODO(JMM): Should we expose the functor?
+  policy.set_scratch_size(1, Kokkos::PerTeam(sizeof(Real) * total_i * total_j));
+  const int nteams =
+      policy.team_size_recommended(DummyFunctor(), Kokkos::ParallelForTag());
+  concurrency_ = space.concurrency() / nteams;
+#else
+  concurrency_ = 1;
+#endif // KOKKOS_ENABLE_CUDA
+
+  if (nkp_ == all_outer)
+    nkp_ = total_k;
+  else if (nkp_ == no_outer)
+    nkp_ = 1;
+  if (njp_ == all_outer)
+    njp_ = total_j;
+  else if (njp_ == no_outer)
+    njp_ = 1;
+
+  if (nkp_ == 0) {
+#ifdef KOKKOS_ENABLE_CUDA
+    nkp_ = total_k;
+#else
+    nkp_ = 1;
+#endif
+  } else if (nkp_ > total_k) {
+    nkp_ = total_k;
+  }
+  if (njp_ == 0) {
+#ifdef KOKKOS_ENABLE_CUDA
+    // From Forrest Glines:
+    // nkp_ * njp_ >= number of SMs / number of streams
+    // => njp_ >= SMS / streams / NKP
+    njp_ = std::min(concurrency_ / (NSTREAMS_ * nkp_), total_j);
+#else
+    njp_ = 1;
+#endif
+  } else if (njp_ > total_j) {
+    njp_ = total_j;
+  }
+
+  // add a tiny bit to avoid round-off issues when we ultimately convert to int
+  // JMM: Do NOT cast these to integers here. The casting happens later.
+  // These being doubles is necessary for proper interleaving of work.
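+  // For example (illustrative numbers only): with total_k = 6 and nkp_ = 4,
+  // target_k_ is ~1.5, and GetBoundsK (defined in index_split.hpp) yields the
+  // k ranges {0,0}, {1,2}, {3,3}, {4,5} -- uneven but non-overlapping pieces
+  // that tile 0..5 exactly.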
+  target_k_ = (1.0 * total_k) / nkp_ + 1.e-6;
+  target_j_ = (1.0 * total_j) / njp_ + 1.e-6;
+
+  // save the "entire" ranges
+  // don't bother saving ".s" since it's always zero
+  auto ib = md->GetBoundsI(IndexDomain::entire);
+  auto jb = md->GetBoundsJ(IndexDomain::entire);
+  auto kb = md->GetBoundsK(IndexDomain::entire);
+  kbe_entire_ = kb.e;
+  jbe_entire_ = jb.e;
+  ibe_entire_ = ib.e;
+}
+
+} // namespace parthenon
diff --git a/src/utils/index_split.hpp b/src/utils/index_split.hpp
new file mode 100644
index 000000000000..2c560d5e247c
--- /dev/null
+++ b/src/utils/index_split.hpp
@@ -0,0 +1,118 @@
+//========================================================================================
+// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//========================================================================================
+
+#ifndef UTILS_INDEX_SPLIT_HPP_
+#define UTILS_INDEX_SPLIT_HPP_
+
+#include "basic_types.hpp"
+#include "defs.hpp"
+#include "globals.hpp"
+#include "mesh/domain.hpp"
+
+namespace parthenon {
+
+// forward declarations
+template <typename T>
+class MeshData;
+
+class IndexSplit {
+ public:
+  static constexpr int all_outer = -100;
+  static constexpr int no_outer = -200;
+  IndexSplit(MeshData<Real> *md, const IndexRange &kb, const IndexRange &jb,
+             const IndexRange &ib, const int nkp, const int njp);
+  IndexSplit(MeshData<Real> *md, IndexDomain domain, const int nkp, const int njp);
+
+  int outer_size() const { return nkp_ * njp_; }
+  KOKKOS_INLINE_FUNCTION
+  IndexRange GetBoundsK(const int p) const {
+    const auto kf = p / njp_;
+    return {kbs_ + static_cast<int>(kf * target_k_),
+            kbs_ + static_cast<int>((kf + 1) * target_k_) - 1};
+  }
+  KOKKOS_INLINE_FUNCTION
+  IndexRange GetBoundsJ(const int p) const {
+    const auto jf = p % njp_;
+    return {jbs_ + static_cast<int>(jf * target_j_),
+            jbs_ + static_cast<int>((jf + 1) * target_j_) - 1};
+  }
+  KOKKOS_INLINE_FUNCTION
+  IndexRange GetBoundsI() const { return {ibs_, ibe_}; }
+  KOKKOS_INLINE_FUNCTION
+  IndexRange GetBoundsI(const int p) const { return GetBoundsI(); }
+  KOKKOS_INLINE_FUNCTION
+  auto GetBoundsKJI(const int p) const {
+    const auto kb = GetBoundsK(p);
+    const auto jb = GetBoundsJ(p);
+    const auto ib = GetBoundsI(p);
+    return std::make_tuple(kb, jb, ib);
+  }
+  KOKKOS_INLINE_FUNCTION
+  IndexRange GetInnerBounds(const IndexRange &jb) const {
+    return {ibs_, (ibe_entire_ + 1) * (jb.e - jb.s + 1) - (ibe_entire_ - ibe_) - 1};
+  }
+  KOKKOS_INLINE_FUNCTION
+  IndexRange GetInnerBounds(const IndexRange &jb, const IndexRange &ib) const {
+    return {ib.s, (ibe_entire_ + 1) * (jb.e - jb.s + 1) - (ibe_entire_ - ib.e) - 1};
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool is_i_ghost(const int idx) const {
+    const int ni = ibe_entire_ + 1;
+    const int i = idx % ni;
+    const int i_inner_size = ni - 2 * nghost_;
+    return (i < nghost_ || i - nghost_ >= i_inner_size);
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool is_j_ghost(const int outer_idx, const int idx) const {
+    const int ni = ibe_entire_ + 1;
+    const int j = GetBoundsJ(outer_idx).s + idx / ni;
+    const int j_inner_size = jbe_entire_ + 1 - 2 * nghost_;
+    return (ndim_ > 1 && (j < nghost_ || j - nghost_ >= j_inner_size));
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool is_k_ghost(const int k) const {
+    const int k_inner_size = kbe_entire_ + 1 - 2 * nghost_;
+    return (ndim_ > 2 && (k < nghost_ || k - nghost_ >= k_inner_size));
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool is_ghost(const int outer_idx, const int k, const int idx) const {
+    return is_k_ghost(k) || is_j_ghost(outer_idx, idx) || is_i_ghost(idx);
+  }
+  KOKKOS_INLINE_FUNCTION
+  int get_max_ni() const { return ibe_entire_ + 1; }
+  // TODO(@jdolence) these overestimate max size...should probably fix
+  int get_max_nj() const { return (jbe_entire_ + 1) / njp_ + 1; }
+  int get_max_nk() const { return (kbe_entire_ + 1) / nkp_ + 1; }
+  // inner_size could be used to find the bounds for a loop that is collapsed over
+  // 1, 2, or 3 dimensions by providing the right starting and stopping indices
+  template <typename V>
+  KOKKOS_INLINE_FUNCTION int inner_size(const V &v, const IndexRange &kb,
+                                        const IndexRange &jb,
+                                        const IndexRange &ib) const {
+    return &v(0, kb.e, jb.e, ib.e) - &v(0, kb.s, jb.s, ib.s);
+  }
+
+ private:
+  // TODO(JMM): Replace this with a macro or something when available
+  static constexpr int NSTREAMS_ = 1; // Change if we add streams back
+  int concurrency_; // = NSMs = 132 for NVIDIA H100
+  int nghost_, nkp_, njp_, kbs_, jbs_, ibs_, ibe_;
+  int kbe_entire_, jbe_entire_, ibe_entire_, ndim_;
+  float target_k_, target_j_;
+
+  void Init(MeshData<Real> *md, const int kbe, const int jbe);
+};
+
+} // namespace parthenon
+
+#endif // UTILS_INDEX_SPLIT_HPP_
diff --git a/src/utils/instrument.hpp b/src/utils/instrument.hpp
new file mode 100644
index 000000000000..17764657f5ca
--- /dev/null
+++ b/src/utils/instrument.hpp
@@ -0,0 +1,53 @@
+//========================================================================================
+// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved.
+//
+// This program was produced under U.S. Government contract 89233218CNA000001 for Los
+// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
+// for the U.S. Department of Energy/National Nuclear Security Administration. All rights
+// in the program are reserved by Triad National Security, LLC, and the U.S. Department
+// of Energy/National Nuclear Security Administration. The Government is granted for
+// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide
+// license in this material to reproduce, prepare derivative works, distribute copies to
+// the public, perform publicly and display publicly, and to permit others to do so.
+//======================================================================================== +#ifndef UTILS_INSTRUMENT_HPP_ +#define UTILS_INSTRUMENT_HPP_ + +#include + +#include + +#define __UNIQUE_INST_VAR2(x, y) x##y +#define __UNIQUE_INST_VAR(x, y) __UNIQUE_INST_VAR2(x, y) +#define PARTHENON_INSTRUMENT \ + KokkosTimer __UNIQUE_INST_VAR(internal_inst, __LINE__)(__FILE__, __LINE__, __func__); +#define PARTHENON_INSTRUMENT_REGION(name) \ + KokkosTimer __UNIQUE_INST_VAR(internal_inst_reg, __LINE__)(name); +#define PARTHENON_INSTRUMENT_REGION_PUSH \ + Kokkos::Profiling::pushRegion(build_auto_label(__FILE__, __LINE__, __func__)); +#define PARTHENON_INSTRUMENT_REGION_POP Kokkos::Profiling::popRegion(); +#define PARTHENON_AUTO_LABEL parthenon::build_auto_label(__FILE__, __LINE__, __func__) + +namespace parthenon { + +inline std::string build_auto_label(const std::string &fullpath, const int line, + const std::string &name) { + size_t pos = fullpath.find_last_of("/\\"); + std::string file = (pos != std::string::npos ? fullpath.substr(pos + 1) : fullpath); + return file + "::" + std::to_string(line) + "::" + name; +} + +struct KokkosTimer { + KokkosTimer(const std::string &file, const int line, const std::string &name) { + Push(build_auto_label(file, line, name)); + } + explicit KokkosTimer(const std::string &name) { Push(name); } + ~KokkosTimer() { Kokkos::Profiling::popRegion(); } + + private: + void Push(const std::string &name) { Kokkos::Profiling::pushRegion(name); } +}; + +} // namespace parthenon + +#endif // UTILS_INSTRUMENT_HPP_ diff --git a/tst/regression/test_suites/advection_outflow/advection_outflow.py b/tst/regression/test_suites/advection_outflow/advection_outflow.py index 5ab301b25fd5..bfb87717eba1 100644 --- a/tst/regression/test_suites/advection_outflow/advection_outflow.py +++ b/tst/regression/test_suites/advection_outflow/advection_outflow.py @@ -1,6 +1,6 @@ # ======================================================================================== # Parthenon performance portable AMR framework -# Copyright(C) 2020 The Parthenon collaboration +# Copyright(C) 2020-2024 The Parthenon collaboration # Licensed under the 3-clause BSD License, see LICENSE file for details # ======================================================================================== # (C) (or copyright) 2020-2021. Triad National Security, LLC. All rights reserved. @@ -41,6 +41,8 @@ def Analyse(self, parameters): print("Couldn't find module to compare Parthenon hdf5 files.") return False + test_passed = True + delta = compare( [ "outflow.out0.final.phdf", @@ -50,4 +52,30 @@ def Analyse(self, parameters): check_metadata=False, ) - return delta == 0 + if delta != 0: + print("Compare to gold standard failed. 
Files differ!") + test_passed = False + + try: + from phdf import phdf + except ModuleNotFoundError: + print("Couldn't find module to open Parthenon hdf5 files.") + return False + + ## compute the derived var 'manually' and compare to the output derived var + data_filename = "outflow.out0.final.phdf" + data_file = phdf(data_filename) + q = data_file.Get("advected")[0] + import numpy as np + + my_derived_var = np.log10(q + 1.0e-5) + file_derived_var = data_file.Get("my_derived_var")[0] + + try: + np.testing.assert_array_max_ulp(file_derived_var, my_derived_var) + except AssertionError as err: + print(err) + print("Mismatch between explicit and derived var in output.") + test_passed = False + + return test_passed diff --git a/tst/regression/test_suites/advection_outflow/parthinput.advection_outflow b/tst/regression/test_suites/advection_outflow/parthinput.advection_outflow index 7a027f2a4a6e..13eab9a9440b 100644 --- a/tst/regression/test_suites/advection_outflow/parthinput.advection_outflow +++ b/tst/regression/test_suites/advection_outflow/parthinput.advection_outflow @@ -64,4 +64,4 @@ num_vars = 1 # number of variables in variable vector file_type = hdf5 dt = 0.5 -variables = advected +variables = advected, my_derived_var diff --git a/tst/regression/utils/test_case.py b/tst/regression/utils/test_case.py index 51fdd2bffa0d..12302568fbd5 100644 --- a/tst/regression/utils/test_case.py +++ b/tst/regression/utils/test_case.py @@ -2,7 +2,7 @@ # Copyright(C) 2014 James M. Stone and other code contributors # Licensed under the 3-clause BSD License, see LICENSE file for details # ======================================================================================== -# (C) (or copyright) 2020-2021. Triad National Security, LLC. All rights reserved. +# (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. # # This program was produced under U.S. Government contract 89233218CNA000001 for Los # Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC @@ -17,7 +17,7 @@ import os from shutil import rmtree import subprocess -from subprocess import PIPE +from subprocess import PIPE, STDOUT import sys from shutil import which @@ -242,7 +242,8 @@ def Run(self): print(" ".join(run_command)) sys.stdout.flush() try: - proc = subprocess.run(run_command, check=True, stdout=PIPE, stderr=PIPE) + proc = subprocess.run(run_command, check=True, stdout=PIPE, stderr=STDOUT) + print(proc.stdout.decode()) self.parameters.stdouts.append(proc.stdout) except subprocess.CalledProcessError as err: print("\n*****************************************************************") diff --git a/tst/style/cpplint.py b/tst/style/cpplint.py index 4df4a7d26033..c2c402f46295 100755 --- a/tst/style/cpplint.py +++ b/tst/style/cpplint.py @@ -7026,11 +7026,11 @@ def FlagCxx11Features(filename, clean_lines, linenum, error): # Flag unapproved C++11 headers. 
if include and include.group(1) in ( "cfenv", - "condition_variable", + # "condition_variable", "fenv.h", - "future", - "mutex", - "thread", + # "future", + # "mutex", + # "thread", # "chrono", "ratio", # "regex", diff --git a/tst/unit/CMakeLists.txt b/tst/unit/CMakeLists.txt index 95f7ca3ecebd..05180d532340 100644 --- a/tst/unit/CMakeLists.txt +++ b/tst/unit/CMakeLists.txt @@ -26,12 +26,13 @@ list(APPEND unit_tests_SOURCES test_unit_domain.cpp test_unit_sort.cpp kokkos_abstraction.cpp + test_index_split.cpp test_logical_location.cpp test_metadata.cpp - test_pararrays.cpp test_meshblock_data_iterator.cpp test_mesh_data.cpp test_nan_tags.cpp + test_pararrays.cpp test_sparse_pack.cpp test_swarm.cpp test_required_desired.cpp diff --git a/tst/unit/test_index_split.cpp b/tst/unit/test_index_split.cpp new file mode 100644 index 000000000000..a327dc8adbb9 --- /dev/null +++ b/tst/unit/test_index_split.cpp @@ -0,0 +1,281 @@ +//======================================================================================== +// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// +// This program was produced under U.S. Government contract 89233218CNA000001 for Los +// Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC +// for the U.S. Department of Energy/National Nuclear Security Administration. All rights +// in the program are reserved by Triad National Security, LLC, and the U.S. Department +// of Energy/National Nuclear Security Administration. The Government is granted for +// itself and others acting on its behalf a nonexclusive, paid-up, irrevocable worldwide +// license in this material to reproduce, prepare derivative works, distribute copies to +// the public, perform publicly and display publicly, and to permit others to do so. +//======================================================================================== +#include +#include +#include + +#include + +#include + +#include "basic_types.hpp" +#include "globals.hpp" +#include "interface/data_collection.hpp" +#include "interface/mesh_data.hpp" +#include "interface/meshblock_data.hpp" +#include "interface/metadata.hpp" +#include "interface/sparse_pack.hpp" +#include "kokkos_abstraction.hpp" +#include "mesh/meshblock.hpp" +#include "parthenon/package.hpp" +#include "utils/index_split.hpp" + +// TODO(jcd): can't call the MeshBlock constructor without mesh_refinement.hpp??? +#include "mesh/mesh_refinement.hpp" + +using namespace parthenon::package::prelude; +using parthenon::BlockList_t; +using parthenon::DevExecSpace; +using parthenon::IndexDomain; +using parthenon::IndexSplit; +using parthenon::MeshBlock; +using parthenon::MeshBlockData; +using parthenon::MeshData; +using parthenon::Metadata; +using parthenon::PackIndexMap; +using parthenon::par_for; +using parthenon::Real; +using parthenon::StateDescriptor; + +namespace { +BlockList_t MakeBlockList(const std::shared_ptr pkg, const int NBLOCKS, + const int NSIDE, const int NDIM) { + BlockList_t block_list; + block_list.reserve(NBLOCKS); + for (int i = 0; i < NBLOCKS; ++i) { + auto pmb = std::make_shared(NSIDE, NDIM); + auto &pmbd = pmb->meshblock_data.Get(); + pmbd->Initialize(pkg, pmb); + block_list.push_back(pmb); + } + return block_list; +} +// JMM: Variables aren't really needed for this test but... +struct v1 : public parthenon::variable_names::base_t { + template + KOKKOS_INLINE_FUNCTION v1(Ts &&...args) + : parthenon::variable_names::base_t(std::forward(args)...) 
{} + static std::string name() { return "v1"; } +}; +struct v3 : public parthenon::variable_names::base_t { + template + KOKKOS_INLINE_FUNCTION v3(Ts &&...args) + : parthenon::variable_names::base_t(std::forward(args)...) {} + static std::string name() { return "v3"; } +}; +struct v5 : public parthenon::variable_names::base_t { + template + KOKKOS_INLINE_FUNCTION v5(Ts &&...args) + : parthenon::variable_names::base_t(std::forward(args)...) {} + static std::string name() { return "v5"; } +}; +} // namespace + +TEST_CASE("IndexSplit", "[IndexSplit]") { + GIVEN("A set of meshblocks and meshblock and mesh data") { + constexpr int N = 6; + constexpr int NDIM = 3; + constexpr int NBLOCKS = 9; + const std::vector scalar_shape{N, N, N}; + const std::vector vector_shape{N, N, N, 3}; + + Metadata m({Metadata::Independent, Metadata::WithFluxes}, scalar_shape); + Metadata m_vector({Metadata::Independent, Metadata::WithFluxes, Metadata::Vector}, + vector_shape); + auto pkg = std::make_shared("Test package"); + pkg->AddField(v1::name(), m); + pkg->AddField(v3::name(), m_vector); + pkg->AddField(v5::name(), m); + BlockList_t block_list = MakeBlockList(pkg, NBLOCKS, N, NDIM); + + MeshData mesh_data("base"); + mesh_data.Set(block_list, nullptr, NDIM); + + WHEN("We initialize an IndexSplit with all outer k and no outer j") { + IndexSplit sp(&mesh_data, IndexDomain::interior, IndexSplit::all_outer, + IndexSplit::no_outer); + THEN("The outer range should be appropriate") { REQUIRE(sp.outer_size() == N); } + THEN("The inner ranges should be appropriate") { + using atomic_view = Kokkos::MemoryTraits; + Kokkos::View nwrong("nwrong", 1); + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "Test IndexSplit", DevExecSpace(), 0, 0, 0, + sp.outer_size() - 1, // N * N - 1 + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int outer_idx) { + const auto krange = sp.GetBoundsK(outer_idx); + const auto jrange = sp.GetBoundsJ(outer_idx); + const auto irange = sp.GetInnerBounds(jrange); + // JMM: Note that these are little cleaner without ghosts + if (!(krange.s == outer_idx)) nwrong(0) += 1; + if (!(krange.e == outer_idx)) nwrong(0) += 1; + if (!(jrange.s == 0)) nwrong(0) += 1; + if (!(jrange.e == N - 1)) nwrong(0) += 1; + if (!(irange.s == 0)) nwrong(0) += 1; + if (!(irange.e == (N * N - 1))) nwrong(0) += 1; + }); + auto nwrong_h = Kokkos::create_mirror_view(nwrong); + Kokkos::deep_copy(nwrong_h, nwrong); + REQUIRE(nwrong_h(0) == 0); + } + } + WHEN("We initialize an IndexSplit with outer k and outer j") { + IndexSplit sp(&mesh_data, IndexDomain::interior, IndexSplit::all_outer, + IndexSplit::all_outer); + THEN("the outer index range should be appropriate") { + REQUIRE(sp.outer_size() == (N * N)); + } + THEN("The inner index ranges should be appropriate") { + using atomic_view = Kokkos::MemoryTraits; + Kokkos::View nwrong("nwrong", 1); + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "Test IndexSplit", DevExecSpace(), 0, 0, 0, + sp.outer_size() - 1, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int outer_idx) { + const auto krange = sp.GetBoundsK(outer_idx); + const auto jrange = sp.GetBoundsJ(outer_idx); + const auto irange = sp.GetInnerBounds(jrange); + if (!(krange.s == krange.e)) nwrong(0) += 1; + if (!(jrange.s == jrange.e)) nwrong(0) += 1; + if (!(irange.s == 0)) nwrong(0) += 1; + if (!(irange.e == N - 1)) nwrong(0) += 1; + }); + auto nwrong_h = Kokkos::create_mirror_view(nwrong); + Kokkos::deep_copy(nwrong_h, nwrong); + REQUIRE(nwrong_h(0) == 0); + } + } + + WHEN("We initialize with nkp > NK") 
{ + constexpr int NKP = N + 1; + REQUIRE(NKP > N); + IndexSplit sp(&mesh_data, IndexDomain::interior, NKP, IndexSplit::no_outer); + THEN("The outer index range should not overrun the mesh domain") { + REQUIRE(sp.outer_size() == N); + } + } + + WHEN("We initialize with nkp*njp > NK*NJ") { + constexpr int NTOOBIG = N + 1; + REQUIRE(NTOOBIG > N); + IndexSplit sp(&mesh_data, IndexDomain::interior, NTOOBIG, NTOOBIG); + THEN("The outer index range should not overrun the mesh domain") { + REQUIRE(sp.outer_size() == N * N); + } + } + + WHEN("We initialize an IndexSplit so that work and nj are evenly divisible") { + constexpr int NJP = 3; + REQUIRE(N % NJP == 0); + IndexSplit sp(&mesh_data, IndexDomain::interior, IndexSplit::all_outer, NJP); + THEN("The outer index range should be appropriate") { + REQUIRE(sp.outer_size() == NJP * N); + } + THEN("The inner index ranges should be appropriate") { + using atomic_view = Kokkos::MemoryTraits; + Kokkos::View nwrong("nwrong", 1); + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "Test IndexSplit", DevExecSpace(), 0, 0, 0, + sp.outer_size() - 1, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int outer_idx) { + const auto krange = sp.GetBoundsK(outer_idx); + const auto jrange = sp.GetBoundsJ(outer_idx); + const auto irange = sp.GetInnerBounds(jrange); + if (!(krange.s == krange.e)) nwrong(0) += 1; + if (!(jrange.e == jrange.s + 1)) nwrong(0) += 1; + if (!((irange.e - irange.s + 1) == (N / NJP) * N)) nwrong(0) += 1; + }); + auto nwrong_h = Kokkos::create_mirror_view(nwrong); + Kokkos::deep_copy(nwrong_h, nwrong); + REQUIRE(nwrong_h(0) == 0); + } + } + + WHEN("We initialize an IndexSplit so that work and nk are evenly divisible") { + constexpr int NKP = 3; + REQUIRE(N % NKP == 0); + IndexSplit sp(&mesh_data, IndexDomain::interior, NKP, IndexSplit::no_outer); + THEN("The outer index range should be appropriate") { + REQUIRE(sp.outer_size() == NKP); + } + THEN("The inner index ranges should be appropriate") { + using atomic_view = Kokkos::MemoryTraits; + Kokkos::View nwrong("nwrong", 1); + parthenon::par_for_outer( + DEFAULT_OUTER_LOOP_PATTERN, "Test IndexSplit", DevExecSpace(), 0, 0, 0, + sp.outer_size() - 1, + KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int outer_idx) { + const auto krange = sp.GetBoundsK(outer_idx); + const auto jrange = sp.GetBoundsJ(outer_idx); + const auto irange = sp.GetInnerBounds(jrange); + // The user is expected to loop over k manually between + // the outer loop and the inner. 
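+                  // An illustrative sketch of that pattern (work(k, i) here is a
+                  // hypothetical stand-in, not part of this test):
+                  //   for (int k = krange.s; k <= krange.e; ++k)
+                  //     parthenon::par_for_inner(member, irange.s, irange.e,
+                  //                              [&](const int i) { work(k, i); });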
+ if (!((krange.e - krange.s + 1) == (N / NKP))) nwrong(0) += 1; + if (!((jrange.e - jrange.s + 1) == N)) nwrong(0) += 1; + if (!((irange.e - irange.s + 1) == (N * N))) nwrong(0) += 1; + }); + auto nwrong_h = Kokkos::create_mirror_view(nwrong); + Kokkos::deep_copy(nwrong_h, nwrong); + REQUIRE(nwrong_h(0) == 0); + } + } + + WHEN("We initialize an IndexSplit so the work and nj aren't evenly divisible") { + constexpr int NJP = 4; + REQUIRE(N % NJP > 0); + IndexSplit sp(&mesh_data, IndexDomain::interior, IndexSplit::all_outer, NJP); + THEN("The outer index range should be appropriate") { + REQUIRE(sp.outer_size() == NJP * N); + } + THEN("The inner index ranges should be appropriate") { + int total_work = 0; + const int outer_size = sp.outer_size(); + parthenon::par_reduce( + parthenon::loop_pattern_flatrange_tag, "Test IndexSplit", DevExecSpace(), 0, + outer_size - 1, + KOKKOS_LAMBDA(const int outer_idx, int &total_work) { + const auto krange = sp.GetBoundsK(outer_idx); + const auto jrange = sp.GetBoundsJ(outer_idx); + const auto irange = sp.GetInnerBounds(jrange); + const int local_work = + (krange.e - krange.s + 1) * (irange.e - irange.s + 1); + total_work += local_work; + }, + Kokkos::Sum(total_work)); + REQUIRE(total_work == N * N * N); + } + } + + WHEN("We initialize an IndexSplit so the work and nk aren't evenly divisible") { + constexpr int NKP = 4; + REQUIRE(N % NKP > 0); + IndexSplit sp(&mesh_data, IndexDomain::interior, NKP, IndexSplit::no_outer); + THEN("The outer index range should be appropriate") { + REQUIRE(sp.outer_size() == NKP); + } + THEN("The inner index ranges should be appropriate") { + int total_work = 0; + parthenon::par_reduce( + parthenon::loop_pattern_flatrange_tag, "Test IndexSplit", DevExecSpace(), 0, + sp.outer_size() - 1, + KOKKOS_LAMBDA(const int outer_idx, int &total_work) { + const auto krange = sp.GetBoundsK(outer_idx); + const auto jrange = sp.GetBoundsJ(outer_idx); + const auto irange = sp.GetInnerBounds(jrange); + total_work += (krange.e - krange.s + 1) * (irange.e - irange.s + 1); + }, + Kokkos::Sum(total_work)); + REQUIRE(total_work == N * N * N); + } + } + } +} diff --git a/tst/unit/test_metadata.cpp b/tst/unit/test_metadata.cpp index f8c246066526..1c7c64acd743 100644 --- a/tst/unit/test_metadata.cpp +++ b/tst/unit/test_metadata.cpp @@ -229,18 +229,6 @@ TEST_CASE("Refinement Information in Metadata", "[Metadata]") { } } } - // JMM: I also wanted to test registration of refinement operations - // but this turns out to be impossible because Catch2 macros are not - // careful with commas, and the macro interprets commas within the - // template as separate arguments. - GIVEN("A metadata struct without the relevant flags set") { - Metadata m; - WHEN("We try to request refinement functions") { - THEN("It should fail") { - REQUIRE_THROWS_AS(m.GetRefinementFunctions(), std::runtime_error); - } - } - } GIVEN("A simple metadata object") { using FlagVec = std::vector; Metadata m(FlagVec{Metadata::Derived, Metadata::OneCopy}); diff --git a/tst/unit/test_sparse_pack.cpp b/tst/unit/test_sparse_pack.cpp index 013fadda811f..4efea18a5054 100644 --- a/tst/unit/test_sparse_pack.cpp +++ b/tst/unit/test_sparse_pack.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
diff --git a/tst/unit/test_metadata.cpp b/tst/unit/test_metadata.cpp
index f8c246066526..1c7c64acd743 100644
--- a/tst/unit/test_metadata.cpp
+++ b/tst/unit/test_metadata.cpp
@@ -229,18 +229,6 @@ TEST_CASE("Refinement Information in Metadata", "[Metadata]") {
       }
     }
   }
-  // JMM: I also wanted to test registration of refinement operations
-  // but this turns out to be impossible because Catch2 macros are not
-  // careful with commas, and the macro interprets commas within the
-  // template as separate arguments.
-  GIVEN("A metadata struct without the relevant flags set") {
-    Metadata m;
-    WHEN("We try to request refinement functions") {
-      THEN("It should fail") {
-        REQUIRE_THROWS_AS(m.GetRefinementFunctions(), std::runtime_error);
-      }
-    }
-  }
   GIVEN("A simple metadata object") {
     using FlagVec = std::vector<MetadataFlag>;
     Metadata m(FlagVec{Metadata::Derived, Metadata::OneCopy});
diff --git a/tst/unit/test_sparse_pack.cpp b/tst/unit/test_sparse_pack.cpp
index 013fadda811f..4efea18a5054 100644
--- a/tst/unit/test_sparse_pack.cpp
+++ b/tst/unit/test_sparse_pack.cpp
@@ -1,5 +1,5 @@
 //========================================================================================
-// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S. Government contract 89233218CNA000001 for Los
 // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC
@@ -77,13 +77,80 @@ struct v5
   static std::string name() { return "v5"; }
 };
 
+using parthenon::variable_names::ANYDIM;
+struct v7 : public parthenon::variable_names::base_t<false, ANYDIM, 3> {
+  template <class... Ts>
+  KOKKOS_INLINE_FUNCTION v7(Ts &&...args)
+      : parthenon::variable_names::base_t<false, ANYDIM, 3>(std::forward<Ts>(args)...) {}
+  static std::string name() { return "v7"; }
+};
+
 } // namespace
 
 TEST_CASE("Test behavior of sparse packs", "[SparsePack]") {
+  constexpr int N = 6;
+  constexpr int NDIM = 3;
+  constexpr int NBLOCKS = 9;
+
+  GIVEN("A tensor variable on a mesh") {
+    const std::vector<int> tensor_shape{N, N, N, 3, 3};
+    Metadata m_tensor({Metadata::Independent}, tensor_shape);
+    auto pkg = std::make_shared<StateDescriptor>("Test package");
+    pkg->AddField<v7>(m_tensor);
+    BlockList_t block_list = MakeBlockList(pkg, NBLOCKS, N, NDIM);
+
+    MeshData<Real> mesh_data("base");
+    mesh_data.Set(block_list, nullptr);
+
+    WHEN("We initialize the independent variables by hand and deallocate one") {
+      auto ib = block_list[0]->cellbounds.GetBoundsI(IndexDomain::entire);
+      auto jb = block_list[0]->cellbounds.GetBoundsJ(IndexDomain::entire);
+      auto kb = block_list[0]->cellbounds.GetBoundsK(IndexDomain::entire);
+      for (int b = 0; b < NBLOCKS; ++b) {
+        auto &pmb = block_list[b];
+        auto &pmbd = pmb->meshblock_data.Get();
+        auto var = pmbd->Get("v7");
+        auto var5 = var.data.Get<5>();
+        int slower_rank = var5.GetDim(5);
+        int faster_rank = var5.GetDim(4);
+        par_for(
+            loop_pattern_mdrange_tag, "initializev7", DevExecSpace(), kb.s, kb.e, jb.s,
+            jb.e, ib.s, ib.e, KOKKOS_LAMBDA(int k, int j, int i) {
+              for (int l = 0; l < slower_rank; ++l) {
+                for (int m = 0; m < faster_rank; ++m) {
+                  Real n = m + 1e1 * l;
+                  var5(l, m, k, j, i) = n;
+                }
+              }
+            });
+      }
+      THEN("A sparse pack can correctly index into tensor types") {
+        auto desc = parthenon::MakePackDescriptor<v7>(pkg.get());
+        auto sparse_pack = desc.GetPack(&mesh_data);
+        int nwrong = 0;
+        int nl = tensor_shape[4];
+        int nm = tensor_shape[3];
+        par_reduce(
+            loop_pattern_mdrange_tag, "check vector", DevExecSpace(), 0,
+            sparse_pack.GetNBlocks() - 1, kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
+            KOKKOS_LAMBDA(int b, int k, int j, int i, int &ltot) {
+              // 0-th is ANYDIM, 1st is 3.
+              for (int l = 0; l < nl; ++l) {
+                for (int m = 0; m < nm; ++m) {
+                  Real n = m + 1e1 * l;
+                  if (sparse_pack(b, v7(l, m), k, j, i) != n) {
+                    ltot += 1;
+                  }
+                }
+              }
+            },
+            nwrong);
+        REQUIRE(nwrong == 0);
+      }
+    }
+  }
+
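An aside on the new v7 type, inferred from the shape logic and the in-kernel comment
above rather than spelled out in the patch: base_t<false, ANYDIM, 3> leaves the slower
component index open at compile time, taking whatever extent the Metadata shape
supplies at allocation (here tensor_shape[4] = 3), while the faster index is pinned to
3. Component (l, m) at cell (k, j, i) of block b is then addressed as
sparse_pack(b, v7(l, m), k, j, i), exactly as the reduction above exercises.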
   GIVEN("A set of meshblocks and meshblock and mesh data") {
-    constexpr int N = 6;
-    constexpr int NDIM = 3;
-    constexpr int NBLOCKS = 9;
     const std::vector<int> scalar_shape{N, N, N};
     const std::vector<int> vector_shape{N, N, N, 3};
 
@@ -122,6 +189,7 @@ TEST_CASE("Test behavior of sparse packs", "[SparsePack]") {
         });
       }
     }
+
     // Deallocate a variable on an arbitrary block
     block_list[2]->DeallocateSparse("v3");
 
@@ -136,6 +204,17 @@ TEST_CASE("Test behavior of sparse packs", "[SparsePack]") {
         REQUIRE(hi == 0); // hi is scalar. Only one value.
       }
+      THEN("A sparse pack correctly loads this data and can report existence and "
+           "nonexistence for variables on different blocks.") {
+        auto desc = parthenon::MakePackDescriptor<v1, v3, v5>(pkg.get());
+        auto pack = desc.GetPack(&mesh_data);
+        REQUIRE(pack.ContainsHost(2, v1()));
+        REQUIRE(!pack.ContainsHost(2, v3()));
+        REQUIRE(pack.ContainsHost(2, v5()));
+        REQUIRE(!pack.ContainsHost(2, v1(), v3(), v5()));
+        REQUIRE(pack.ContainsHost(2));
+      }
+
       THEN("A sparse pack correctly loads this data and can be read from v3 on all "
            "blocks") {
         // Create a pack use type variables
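Reading the new assertions together: the variadic ContainsHost(b, vars...) behaves as
an all-of query. With v3 deallocated on block 2, ContainsHost(2, v1(), v3(), v5()) is
false even though v1 and v5 individually report true, and the no-variable form
ContainsHost(2) is trivially true.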
diff --git a/tst/unit/test_state_descriptor.cpp b/tst/unit/test_state_descriptor.cpp
index 32f22820de0b..706f4c73c74d 100644
--- a/tst/unit/test_state_descriptor.cpp
+++ b/tst/unit/test_state_descriptor.cpp
@@ -383,7 +383,7 @@ TEST_CASE("Test dependency resolution in StateDescriptor", "[StateDescriptor]")
     }
   }
 
-  WHEN("We register a dense variable custom prolongation/restriction") {
+  WHEN("We register a dense variable with default prolongation/restriction") {
     pkg1->AddField("dense", m_provides);
     WHEN("We register a sparse variable with custom prolongation/restriction") {
       auto m_sparse_provides_ = m_sparse_provides;
@@ -391,8 +391,8 @@ TEST_CASE("Test dependency resolution in StateDescriptor", "[StateDescriptor]")
       pkg2->AddSparsePool("sparse", m_sparse_provides_, sparse_ids);
       THEN("We can perform dependency resolution") {
         auto pkg3 = ResolvePackages(packages);
-        AND_THEN("The two relevant prolongation restriction operators exist and have "
-                 "unique ids") {
+        AND_THEN("The two relevant prolongation restriction operators exist, are "
+                 "appropriately set, and have unique ids") {
           const auto my_funcs =
               parthenon::refinement::RefinementFunctions_t::RegisterOps<ProlongateZero, RestrictZero>();
 
@@ -403,6 +403,12 @@ TEST_CASE("Test dependency resolution in StateDescriptor", "[StateDescriptor]")
           REQUIRE(pkg3->NumRefinementFuncs() == 2);
           REQUIRE((pkg3->RefinementFuncID(my_funcs)) !=
                   (pkg3->RefinementFuncID(cell_funcs)));
+          REQUIRE(pkg3->FieldMetadata("dense").GetRefinementFunctions() == cell_funcs);
+          for (int i = 0; i < sparse_ids.size(); i++) {
+            REQUIRE(
+                pkg3->FieldMetadata("sparse", sparse_ids[i]).GetRefinementFunctions() ==
+                my_funcs);
+          }
         }
       }
     }
diff --git a/tst/unit/test_swarm.cpp b/tst/unit/test_swarm.cpp
index 7bfef8b5aa62..f94ed51333c5 100644
--- a/tst/unit/test_swarm.cpp
+++ b/tst/unit/test_swarm.cpp
@@ -1,9 +1,9 @@
 //========================================================================================
 // Parthenon performance portable AMR framework
-// Copyright(C) 2020 The Parthenon collaboration
+// Copyright(C) 2020-2024 The Parthenon collaboration
 // Licensed under the 3-clause BSD License, see LICENSE file for details
 //========================================================================================
-// (C) (or copyright) 2020-2022. Triad National Security, LLC. All rights reserved.
+// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved.
 //
 // This program was produced under U.S. Government contract 89233218CNA000001
 // for Los Alamos National Laboratory (LANL), which is operated by Triad
@@ -26,6 +26,7 @@
 
 #include "bvals/bvals_interfaces.hpp"
 #include "interface/swarm.hpp"
+#include "kokkos_abstraction.hpp"
 #include "mesh/mesh.hpp"
 
 #include <catch2/catch.hpp>
@@ -41,6 +42,7 @@ using parthenon::MeshBlock;
 using parthenon::Metadata;
 using parthenon::Packages_t;
 using parthenon::ParameterInput;
+using parthenon::ParArray1D;
 using parthenon::ParArrayND;
 using parthenon::ParticleBound;
 using parthenon::Swarm;
@@ -118,8 +120,7 @@ TEST_CASE("Swarm memory management", "[Swarm]") {
   Metadata m_integer({Metadata::Integer, Metadata::Particle});
   swarm->Add(labelVector, m_integer);
 
-  ParArrayND<int> new_indices;
-  auto new_mask = swarm->AddEmptyParticles(1, new_indices);
+  swarm->AddEmptyParticles(1);
   swarm_d = swarm->GetDeviceContext();
   auto x_d = swarm->Get<Real>("x").Get();
   auto x_h = x_d.GetHostMirrorAndCopy();
@@ -132,7 +133,7 @@ TEST_CASE("Swarm memory management", "[Swarm]") {
   x_d.DeepCopy(x_h);
   i_d.DeepCopy(i_h);
 
-  new_mask = swarm->AddEmptyParticles(11, new_indices);
+  swarm->AddEmptyParticles(11);
   swarm_d = swarm->GetDeviceContext();
   x_d = swarm->Get<Real>("x").Get();
   i_d = swarm->Get<int>("i").Get();
@@ -212,7 +213,7 @@ TEST_CASE("Swarm memory management", "[Swarm]") {
   REQUIRE(i_h(1) == 2);
 
   // "Transport" a particle across the IX1 (custom) boundary
-  ParArrayND<int> bc_indices("Boundary indices", 1);
+  ParArray1D<int> bc_indices("Boundary indices", 1);
   meshblock->par_for(
       "Transport", 0, 0, KOKKOS_LAMBDA(const int n) {
         x_d(0) = -0.6;
diff --git a/tst/unit/test_taskid.cpp b/tst/unit/test_taskid.cpp
index 0844226ad48d..14dcc09b500a 100644
--- a/tst/unit/test_taskid.cpp
+++ b/tst/unit/test_taskid.cpp
@@ -19,34 +19,20 @@
 
 #include <catch2/catch.hpp>
 
-#include "tasks/task_id.hpp"
+#include "tasks/tasks.hpp"
 
+using parthenon::Task;
 using parthenon::TaskID;
 
-TEST_CASE("Just check everything", "[CheckDependencies][SetFinished][equal][or]") {
+TEST_CASE("Just check everything", "[GetIDs][empty]") {
   GIVEN("Some TaskIDs") {
-    TaskID a(1);
-    TaskID b(2);
-    TaskID c(BITBLOCK + 1); // make sure we get a task with more than one block
-    TaskID complete;
-
-    TaskID ac = (a | c);
-    bool should_be_false = ac.CheckDependencies(b);
-    bool should_be_truea = ac.CheckDependencies(a);
-    bool should_be_truec = ac.CheckDependencies(c);
-    TaskID abc = (a | b | c);
-    complete.SetFinished(abc);
-    bool equal_true = (complete == abc);
-    bool equal_false = (complete == ac);
-
-    REQUIRE(should_be_false == false);
-    REQUIRE(should_be_truea == true);
-    REQUIRE(should_be_truec == true);
-    REQUIRE(equal_true == true);
-    REQUIRE(equal_false == false);
-
-    WHEN("a negative number is passed") {
-      REQUIRE_THROWS_AS(a.Set(-1), std::invalid_argument);
-    }
+    Task ta, tb;
+    TaskID a(&ta);
+    TaskID b(&tb);
+    TaskID c = a | b;
+    TaskID none;
+
+    REQUIRE(none.empty() == true);
+    REQUIRE(c.GetIDs().size() == 2);
   }
 }
diff --git a/tst/unit/test_tasklist.cpp b/tst/unit/test_tasklist.cpp
index f06ce49c3e99..1790a4eb0ad0 100644
--- a/tst/unit/test_tasklist.cpp
+++ b/tst/unit/test_tasklist.cpp
@@ -19,7 +19,7 @@
 
 // Internal Includes
 #include "basic_types.hpp"
-#include "tasks/task_list.hpp"
+#include "tasks/tasks.hpp"
 
 using parthenon::TaskID;
 using parthenon::TaskList;

From 047ff2e66d1586fafac329ca3e69ab84356d4869 Mon Sep 17 00:00:00 2001
From: Luke Roberts
Date: Mon, 11 Mar 2024 16:59:36 -0600
Subject: [PATCH 29/39] remove all reg_dep_id

---
 example/poisson_gmg/poisson_driver.cpp |  5 ++---
 src/solvers/bicgstab_solver.hpp        |  6 +++---
 src/solvers/mg_solver.hpp              | 14 ++++++--------
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/example/poisson_gmg/poisson_driver.cpp b/example/poisson_gmg/poisson_driver.cpp
index dfe177b26310..0a09d0eb931d 100644
--- a/example/poisson_gmg/poisson_driver.cpp
+++ b/example/poisson_gmg/poisson_driver.cpp
@@ -79,7 +79,6 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) {
   const int num_partitions = pmesh->DefaultNumPartitions();
   TaskRegion &region = tc.AddRegion(num_partitions);
-  int reg_dep_id = 0;
   for (int i = 0; i < num_partitions; ++i) {
     TaskList &tl = region[i];
     auto &md = pmesh->mesh_data.GetOrAdd("base", i);
@@ -100,10 +99,10 @@ TaskCollection PoissonDriver::MakeTaskCollection(BlockList_t &blocks) {
     auto solve = zero_u;
     if (solver == "BiCGSTAB") {
-      auto setup = bicgstab_solver->AddSetupTasks(region, tl, zero_u, i, reg_dep_id, pmesh);
+      auto setup = bicgstab_solver->AddSetupTasks(tl, zero_u, i, pmesh);
       solve = bicgstab_solver->AddTasks(tl, setup, pmesh, i);
     } else if (solver == "MG") {
-      auto setup = mg_solver->AddSetupTasks(region, tl, zero_u, i, reg_dep_id, pmesh);
+      auto setup = mg_solver->AddSetupTasks(tl, zero_u, i, pmesh);
       solve = mg_solver->AddTasks(tl, setup, pmesh, i);
     } else {
       PARTHENON_FAIL("Unknown solver type.");
diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp
index 951a4c78d7f8..f9ec7a5c7d12 100644
--- a/src/solvers/bicgstab_solver.hpp
+++ b/src/solvers/bicgstab_solver.hpp
@@ -87,9 +87,9 @@ class BiCGSTABSolver {
   }
 
   template <class TL_t>
-  TaskID AddSetupTasks(TaskRegion &region, TL_t &tl, TaskID dependence,
-                       int partition, int &reg_dep_id, Mesh *pmesh) {
-    return preconditioner.AddSetupTasks(region, tl, dependence, partition, reg_dep_id, pmesh);
+  TaskID AddSetupTasks(TL_t &tl, TaskID dependence,
+                       int partition, Mesh *pmesh) {
+    return preconditioner.AddSetupTasks(tl, dependence, partition, pmesh);
   }
 
   TaskID AddTasks(TaskList &tl, TaskID dependence, Mesh *pmesh, const int partition) {
diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp
index 655f0c4eed44..5aa7766e99d7 100644
--- a/src/solvers/mg_solver.hpp
+++ b/src/solvers/mg_solver.hpp
@@ -147,15 +147,15 @@ class MGSolver {
   }
 
   template <class TL_t>
-  TaskID AddSetupTasks(TaskRegion &region, TL_t &tl, TaskID dependence,
-                       int partition, int &reg_dep_id, Mesh *pmesh) {
+  TaskID AddSetupTasks(TL_t &tl, TaskID dependence,
+                       int partition, Mesh *pmesh) {
     using namespace utils;
 
     int min_level = 0;
     int max_level = pmesh->GetGMGMaxLevel();
 
-    return AddMultiGridSetupPartitionLevel(region, tl, dependence, partition, reg_dep_id,
-                                           max_level, min_level, max_level, pmesh);
+    return AddMultiGridSetupPartitionLevel(tl, dependence, partition, max_level,
+                                           min_level, max_level, pmesh);
   }
 
   Real GetSquaredResidualSum() const { return residual.val; }
@@ -297,8 +297,7 @@ class MGSolver {
   }
 
   template <class TL_t>
-  TaskID AddMultiGridSetupPartitionLevel(TaskRegion &region, TL_t &tl, TaskID dependence,
-                                         int partition, int &reg_dep_id, int level,
+  TaskID AddMultiGridSetupPartitionLevel(TL_t &tl, TaskID dependence, int partition, int level,
                                          int min_level, int max_level, Mesh *pmesh) {
     using namespace utils;
 
@@ -318,8 +317,7 @@ class MGSolver {
 
     if (level > min_level) {
       task_out =
          tl.AddTask(task_out, SendBoundBufs<BoundaryType::gmg_restrict_send>, md);
-      task_out = AddMultiGridSetupPartitionLevel(region, tl, task_out,
-                                                 partition, reg_dep_id, level - 1,
+      task_out = AddMultiGridSetupPartitionLevel(tl, task_out, partition, level - 1,
                                                  min_level, max_level, pmesh);
     }

From 3bb79fa169d39197cc4089ff02597dd794d479a7 Mon Sep 17 00:00:00 2001
From: Luke Roberts
Date: Mon, 11 Mar 2024 17:01:30 -0600
Subject: [PATCH
30/39] format and lint --- src/interface/mesh_data.cpp | 4 +- src/solvers/bicgstab_solver.hpp | 17 ++++--- src/solvers/mg_solver.hpp | 83 +++++++++++++++++---------------- src/solvers/solver_utils.hpp | 42 +++++++++-------- 4 files changed, 77 insertions(+), 69 deletions(-) diff --git a/src/interface/mesh_data.cpp b/src/interface/mesh_data.cpp index a6bc08e02769..ab64a5d63941 100644 --- a/src/interface/mesh_data.cpp +++ b/src/interface/mesh_data.cpp @@ -25,9 +25,9 @@ void MeshData::Initialize(const MeshData *src, pmy_mesh_ = src->GetParentPointer(); const int nblocks = src->NumBlocks(); block_data_.resize(nblocks); - + grid = src->grid; - if (grid.type == GridType::two_level_composite) { + if (grid.type == GridType::two_level_composite) { int gmg_level = src->grid.logical_level - pmy_mesh_->GetGMGMinLogicalLevel(); for (int i = 0; i < nblocks; i++) { block_data_[i] = pmy_mesh_->gmg_block_lists[gmg_level][i]->meshblock_data.Add( diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index f9ec7a5c7d12..73e8debe8436 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -58,7 +58,7 @@ class BiCGSTABSolver { PARTHENON_INTERNALSOLVERVARIABLE(u, r); PARTHENON_INTERNALSOLVERVARIABLE(u, p); PARTHENON_INTERNALSOLVERVARIABLE(u, x); - + std::vector GetInternalVariableNames() const { std::vector names{rhat0::name(), v::name(), h::name(), s::name(), t::name(), r::name(), p::name(), x::name()}; @@ -87,8 +87,7 @@ class BiCGSTABSolver { } template - TaskID AddSetupTasks(TL_t &tl, TaskID dependence, - int partition, Mesh *pmesh) { + TaskID AddSetupTasks(TL_t &tl, TaskID dependence, int partition, Mesh *pmesh) { return preconditioner.AddSetupTasks(tl, dependence, partition, pmesh); } @@ -97,7 +96,8 @@ class BiCGSTABSolver { TaskID none; auto &md = pmesh->mesh_data.GetOrAdd("base", partition); std::string label = "bicg_comm_" + std::to_string(partition); - auto &md_comm = pmesh->mesh_data.AddShallow(label, md, std::vector{u::name()}); + auto &md_comm = + pmesh->mesh_data.AddShallow(label, md, std::vector{u::name()}); iter_counter = 0; bool multilevel = pmesh->multilevel; @@ -143,7 +143,8 @@ class BiCGSTABSolver { } // 2. v <- A u - auto comm = AddBoundaryExchangeTasks(precon1, itl, md_comm, multilevel); + auto comm = + AddBoundaryExchangeTasks(precon1, itl, md_comm, multilevel); auto get_v = eqs_.template Ax(itl, comm, md); // 3. rhat0v <- (rhat0, v) @@ -192,7 +193,8 @@ class BiCGSTABSolver { } // 7. t <- A u - auto pre_t_comm = AddBoundaryExchangeTasks(precon2, itl, md_comm, multilevel); + auto pre_t_comm = + AddBoundaryExchangeTasks(precon2, itl, md_comm, multilevel); auto get_t = eqs_.template Ax(itl, pre_t_comm, md); // 8. omega <- (t,s) / (t,t) @@ -248,7 +250,8 @@ class BiCGSTABSolver { this, md); // 14. rhat0r_old <- rhat0r, zero all reductions - Real *ptol = presidual_tolerance == nullptr ? &(params_.residual_tolerance) : presidual_tolerance; + Real *ptol = presidual_tolerance == nullptr ? 
&(params_.residual_tolerance) + : presidual_tolerance; auto check = itl.AddTask( TaskQualifier::completion | TaskQualifier::once_per_region | TaskQualifier::global_sync, diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 5aa7766e99d7..b9c19c41a11d 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -13,6 +13,8 @@ #ifndef SOLVERS_MG_SOLVER_HPP_ #define SOLVERS_MG_SOLVER_HPP_ +#include +#include #include #include #include @@ -145,10 +147,9 @@ class MGSolver { return AddMultiGridTasksPartitionLevel(tl, dependence, partition, max_level, min_level, max_level, pmesh); } - + template - TaskID AddSetupTasks(TL_t &tl, TaskID dependence, - int partition, Mesh *pmesh) { + TaskID AddSetupTasks(TL_t &tl, TaskID dependence, int partition, Mesh *pmesh) { using namespace utils; int min_level = 0; @@ -221,28 +222,29 @@ class MGSolver { weight * v1 + (1.0 - weight) * pack(b, te, xold_t(1), k, j, i); }); } else { - const int scratch_size = 0; + const int scratch_size = 0; const int scratch_level = 0; parthenon::par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "Jacobi", DevExecSpace(), scratch_size, scratch_level, - 0, pack.GetNBlocks() - 1, kb.s, kb.e, + DEFAULT_OUTER_LOOP_PATTERN, "Jacobi", DevExecSpace(), scratch_size, + scratch_level, 0, pack.GetNBlocks() - 1, kb.s, kb.e, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b, const int k) { const int nvars = pack.GetUpperBound(b, xnew_t()) - pack.GetLowerBound(b, xnew_t()) + 1; for (int c = 0; c < nvars; ++c) { - Real *Ax = &pack(b, te, Axold_t(c), k, jb.s, ib.s); - Real *diag = &pack(b, te, D_t(c), k, jb.s, ib.s); + Real *Ax = &pack(b, te, Axold_t(c), k, jb.s, ib.s); + Real *diag = &pack(b, te, D_t(c), k, jb.s, ib.s); Real *prhs = &pack(b, te, rhs_t(c), k, jb.s, ib.s); Real *xo = &pack(b, te, xold_t(c), k, jb.s, ib.s); Real *xn = &pack(b, te, xnew_t(c), k, jb.s, ib.s); - const int npoints = (jb.e - jb.s + 1) - * (ib.e - ib.s + 1 + 2 * Globals::nghost) - - 2 * Globals::nghost; - parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { - const Real off_diag = Ax[idx] - diag[idx] * xo[idx]; - const Real val = prhs[idx] - off_diag; - xn[idx] = weight * val / diag[idx] + (1.0 - weight) * xo[idx]; - }); + const int npoints = + (jb.e - jb.s + 1) * (ib.e - ib.s + 1 + 2 * Globals::nghost) - + 2 * Globals::nghost; + parthenon::par_for_inner( + DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { + const Real off_diag = Ax[idx] - diag[idx] * xo[idx]; + const Real val = prhs[idx] - off_diag; + xn[idx] = weight * val / diag[idx] + (1.0 - weight) * xo[idx]; + }); } }); } @@ -255,7 +257,8 @@ class MGSolver { std::shared_ptr> &md_comm) { using namespace utils; - auto comm = AddBoundaryExchangeTasks(depends_on, tl, md_comm, multilevel); + auto comm = + AddBoundaryExchangeTasks(depends_on, tl, md_comm, multilevel); auto mat_mult = eqs_.template Ax(tl, comm, md); return tl.AddTask(mat_mult, &MGSolver::Jacobi, this, md, omega); @@ -283,22 +286,23 @@ class MGSolver { // This copy is to set the coarse blocks in temp to the values in u so that // fine-coarse boundaries of temp are correctly updated during communication depends_on = tl.AddTask(depends_on, CopyData, md); - auto jacobi1 = AddJacobiIteration(tl, depends_on, multilevel, - omega[ndim - 1][0], md, md_comm); + auto jacobi1 = AddJacobiIteration( + tl, depends_on, multilevel, omega[ndim - 1][0], md, md_comm); auto copy1 = tl.AddTask(jacobi1, CopyData, md); if (stages < 2) return copy1; - auto jacobi2 = 
AddJacobiIteration(tl, copy1, multilevel, - omega[ndim - 1][1], md, md_comm); + auto jacobi2 = AddJacobiIteration( + tl, copy1, multilevel, omega[ndim - 1][1], md, md_comm); auto copy2 = tl.AddTask(jacobi2, CopyData, md); if (stages < 3) return copy2; - auto jacobi3 = AddJacobiIteration(tl, copy2, multilevel, - omega[ndim - 1][2], md, md_comm); + auto jacobi3 = AddJacobiIteration( + tl, copy2, multilevel, omega[ndim - 1][2], md, md_comm); return tl.AddTask(jacobi3, CopyData, md); } template - TaskID AddMultiGridSetupPartitionLevel(TL_t &tl, TaskID dependence, int partition, int level, - int min_level, int max_level, Mesh *pmesh) { + TaskID AddMultiGridSetupPartitionLevel(TL_t &tl, TaskID dependence, int partition, + int level, int min_level, int max_level, + Mesh *pmesh) { using namespace utils; bool multilevel = (level != min_level); @@ -309,14 +313,12 @@ class MGSolver { if (level < max_level) { task_out = tl.AddTask(task_out, ReceiveBoundBufs, md); - task_out = - tl.AddTask(task_out, SetBounds, md); + task_out = tl.AddTask(task_out, SetBounds, md); } // If we are finer than the coarsest level: if (level > min_level) { - task_out = - tl.AddTask(task_out, SendBoundBufs, md); + task_out = tl.AddTask(task_out, SendBoundBufs, md); task_out = AddMultiGridSetupPartitionLevel(tl, task_out, partition, level - 1, min_level, max_level, pmesh); } @@ -352,14 +354,15 @@ class MGSolver { auto &md = pmesh->gmg_mesh_data[level].GetOrAdd(level, "base", partition); std::string label = "comm_" + std::to_string(level) + "_" + std::to_string(partition); - auto &md_comm = pmesh->gmg_mesh_data[level].AddShallow(label, md, std::vector{u::name(), res_err::name()}); + auto &md_comm = pmesh->gmg_mesh_data[level].AddShallow( + label, md, std::vector{u::name(), res_err::name()}); // 0. Receive residual from coarser level if there is one auto set_from_finer = dependence; if (level < max_level) { // Fill fields with restricted values - auto recv_from_finer = - tl.AddTask(dependence, ReceiveBoundBufs, md_comm); + auto recv_from_finer = tl.AddTask( + dependence, ReceiveBoundBufs, md_comm); set_from_finer = tl.AddTask( // TaskQualifier::local_sync, // is this required? recv_from_finer, SetBounds, md_comm); // 1. Copy residual from dual purpose communication field to the rhs, should be @@ -391,14 +394,14 @@ class MGSolver { // 2. Do pre-smooth and fill solution on this level set_from_finer = tl.AddTask(set_from_finer, &equations::template SetDiagonal, &eqs_, md); - auto pre_smooth = AddSRJIteration(tl, set_from_finer, - pre_stages, multilevel, md, md_comm); + auto pre_smooth = AddSRJIteration( + tl, set_from_finer, pre_stages, multilevel, md, md_comm); // If we are finer than the coarsest level: auto post_smooth = pre_smooth; if (level > min_level) { // 3. Communicate same level boundaries so that u is up to date everywhere - auto comm_u = AddBoundaryExchangeTasks(pre_smooth, tl, md_comm, - multilevel); + auto comm_u = AddBoundaryExchangeTasks(pre_smooth, tl, + md_comm, multilevel); // 4. Caclulate residual and store in communication field auto residual = eqs_.template Ax(tl, comm_u, md); @@ -414,10 +417,10 @@ class MGSolver { tl, communicate_to_coarse, partition, level - 1, min_level, max_level, pmesh); // 6. 
Receive error field into communication field and prolongate - auto recv_from_coarser = - tl.AddTask(coarser, ReceiveBoundBufs, md_comm); - auto set_from_coarser = - tl.AddTask(recv_from_coarser, SetBounds, md_comm); + auto recv_from_coarser = tl.AddTask( + coarser, ReceiveBoundBufs, md_comm); + auto set_from_coarser = tl.AddTask( + recv_from_coarser, SetBounds, md_comm); auto prolongate = tl.AddTask( // TaskQualifier::local_sync, // is this required? set_from_coarser, ProlongateBounds, md_comm); diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index a5ad37934619..38b7cafd0ee3 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -13,6 +13,8 @@ #ifndef SOLVERS_SOLVER_UTILS_HPP_ #define SOLVERS_SOLVER_UTILS_HPP_ +#include +#include #include #include #include @@ -156,20 +158,19 @@ TaskStatus CopyData(const std::shared_ptr> &md) { static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), only_fine_on_composite); - const int scratch_size = 0; + const int scratch_size = 0; const int scratch_level = 0; parthenon::par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "CopyData", DevExecSpace(), scratch_size, scratch_level, - 0, pack.GetNBlocks() - 1, - KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { - const int nvars = pack.GetUpperBound(b, in_t()) - pack.GetLowerBound(b, in_t()) + 1; - const int npoints = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); + DEFAULT_OUTER_LOOP_PATTERN, "CopyData", DevExecSpace(), scratch_size, scratch_level, + 0, pack.GetNBlocks() - 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { + const int nvars = + pack.GetUpperBound(b, in_t()) - pack.GetLowerBound(b, in_t()) + 1; + const int npoints = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); for (int c = 0; c < nvars; ++c) { Real *in = &pack(b, te, in_t(c), kb.s, jb.s, ib.s); Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); - parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { - out[idx] = in[idx]; - }); + parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, + [&](const int idx) { out[idx] = in[idx]; }); } }); return TaskStatus::complete; @@ -195,11 +196,11 @@ TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> static auto desc = parthenon::MakePackDescriptor(md.get()); auto pack = desc.GetPack(md.get(), include_block, only_fine_on_composite); - const int scratch_size = 0; + const int scratch_size = 0; const int scratch_level = 0; parthenon::par_for_outer( - DEFAULT_OUTER_LOOP_PATTERN, "AddFieldsAndStore", DevExecSpace(), scratch_size, scratch_level, - 0, pack.GetNBlocks() - 1, + DEFAULT_OUTER_LOOP_PATTERN, "AddFieldsAndStore", DevExecSpace(), scratch_size, + scratch_level, 0, pack.GetNBlocks() - 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { const int nvars = pack.GetUpperBound(b, a_t()) - pack.GetLowerBound(b, a_t()) + 1; const int npoints = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); @@ -207,9 +208,9 @@ TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> Real *avar = &pack(b, te, a_t(c), kb.s, jb.s, ib.s); Real *bvar = &pack(b, te, b_t(c), kb.s, jb.s, ib.s); Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); - parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { - out[idx] = wa * avar[idx] + wb * bvar[idx]; - }); + parthenon::par_for_inner( + DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, + [&](const int idx) { 
out[idx] = wa * avar[idx] + wb * bvar[idx]; }); } }); return TaskStatus::complete; @@ -303,7 +304,7 @@ TaskID DotProduct(TaskID dependency_in, TaskList &tl, AllReduce *adotb, template TaskStatus GlobalMinLocal(const std::shared_ptr> &md, - AllReduce *amin) { + AllReduce *amin) { using TE = parthenon::TopologicalElement; TE te = TE::CC; IndexRange ib = md->GetBoundsI(IndexDomain::interior, te); @@ -331,7 +332,7 @@ TaskStatus GlobalMinLocal(const std::shared_ptr> &md, template TaskID GlobalMin(TaskID dependency_in, TaskList &tl, AllReduce *amin, - const std::shared_ptr> &md) { + const std::shared_ptr> &md) { using namespace impl; auto max_amin = tl.AddTask( TaskQualifier::once_per_region | TaskQualifier::local_sync, dependency_in, @@ -340,10 +341,11 @@ TaskID GlobalMin(TaskID dependency_in, TaskList &tl, AllReduce *amin, return TaskStatus::complete; }, amin); - auto get_amin = tl.AddTask(TaskQualifier::local_sync, max_amin, GlobalMinLocal, md, amin); - auto start_global_amin = tl.AddTask(TaskQualifier::once_per_region, get_amin, + auto get_amin = + tl.AddTask(TaskQualifier::local_sync, max_amin, GlobalMinLocal, md, amin); + auto start_global_amin = tl.AddTask(TaskQualifier::once_per_region, get_amin, &AllReduce::StartReduce, amin, MPI_MIN); - return tl.AddTask(TaskQualifier::once_per_region | TaskQualifier::local_sync, + return tl.AddTask(TaskQualifier::once_per_region | TaskQualifier::local_sync, start_global_amin, &AllReduce::CheckReduce, amin); } From 029a29a06bf969236233c4f4fbce07419793fa1c Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 11 Mar 2024 17:13:58 -0600 Subject: [PATCH 31/39] format --- src/solvers/mg_solver.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 0e423e5b2be7..b9c19c41a11d 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -163,8 +163,7 @@ class MGSolver { int GetCurrentIterations() const { return iter_counter; } Real GetFinalResidual() const { return final_residual; } int GetFinalIterations() const { return final_iteration; } - - + protected: MGParams params_; int iter_counter; From 0959179d620449603825282377ffcdf8803cc1ae Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 11 Mar 2024 17:18:26 -0600 Subject: [PATCH 32/39] Update copyrights --- src/bvals/boundary_conditions_generic.hpp | 2 +- src/interface/mesh_data.cpp | 2 +- src/solvers/bicgstab_solver.hpp | 2 +- src/solvers/mg_solver.hpp | 2 +- src/solvers/solver_utils.hpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bvals/boundary_conditions_generic.hpp b/src/bvals/boundary_conditions_generic.hpp index 57efdb901105..12a798403d57 100644 --- a/src/bvals/boundary_conditions_generic.hpp +++ b/src/bvals/boundary_conditions_generic.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC diff --git a/src/interface/mesh_data.cpp b/src/interface/mesh_data.cpp index ab64a5d63941..27a4dc520180 100644 --- a/src/interface/mesh_data.cpp +++ b/src/interface/mesh_data.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2020-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2020-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 73e8debe8436..446609908ea2 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2023-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index b9c19c41a11d..36e9483a0e71 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2023-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index 38b7cafd0ee3..1b170edce812 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2021. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC From c3b65fe224293e4f9da8dd90392d65eef6330113 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 11 Mar 2024 17:18:49 -0600 Subject: [PATCH 33/39] update copyrights --- example/poisson_gmg/poisson_driver.hpp | 2 +- example/poisson_gmg/poisson_package.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/example/poisson_gmg/poisson_driver.hpp b/example/poisson_gmg/poisson_driver.hpp index 348946ab2fb7..17a3cf6989c9 100644 --- a/example/poisson_gmg/poisson_driver.hpp +++ b/example/poisson_gmg/poisson_driver.hpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. 
Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC diff --git a/example/poisson_gmg/poisson_package.cpp b/example/poisson_gmg/poisson_package.cpp index 9c4d3ef27fe1..fddc087a4524 100644 --- a/example/poisson_gmg/poisson_package.cpp +++ b/example/poisson_gmg/poisson_package.cpp @@ -1,5 +1,5 @@ //======================================================================================== -// (C) (or copyright) 2021-2023. Triad National Security, LLC. All rights reserved. +// (C) (or copyright) 2021-2024. Triad National Security, LLC. All rights reserved. // // This program was produced under U.S. Government contract 89233218CNA000001 for Los // Alamos National Laboratory (LANL), which is operated by Triad National Security, LLC From dbfd1e3618e0c7e9185d1d88834c0e22e2282450 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 11 Mar 2024 17:27:27 -0600 Subject: [PATCH 34/39] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74cd391301e8..df35bc7572b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - [[PR 996]](https://github.com/parthenon-hpc-lab/parthenon/pull/996) Remove dynamic allocations from swarm particle creation ### Changed (changing behavior/API/variables/...) +- [[PR 973]](https://github.com/parthenon-hpc-lab/parthenon/pull/973) Multigrid performance upgrades ### Fixed (not changing behavior/API/variables/...) - [[PR1012]](https://github.com/parthenon-hpc-lab/parthenon/pull/1012) Remove accidentally duplicated code From 46d6b0fecf48c0ffcd64565e82558268a41d992d Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 11 Mar 2024 18:44:41 -0600 Subject: [PATCH 35/39] fix gpu bug --- src/solvers/mg_solver.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 36e9483a0e71..216bf6f9fccb 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -224,6 +224,7 @@ class MGSolver { } else { const int scratch_size = 0; const int scratch_level = 0; + const int nghost = Globals::nghost; parthenon::par_for_outer( DEFAULT_OUTER_LOOP_PATTERN, "Jacobi", DevExecSpace(), scratch_size, scratch_level, 0, pack.GetNBlocks() - 1, kb.s, kb.e, @@ -237,8 +238,8 @@ class MGSolver { Real *xo = &pack(b, te, xold_t(c), k, jb.s, ib.s); Real *xn = &pack(b, te, xnew_t(c), k, jb.s, ib.s); const int npoints = - (jb.e - jb.s + 1) * (ib.e - ib.s + 1 + 2 * Globals::nghost) - - 2 * Globals::nghost; + (jb.e - jb.s + 1) * (ib.e - ib.s + 1 + 2 * nghost) - + 2 * nghost; parthenon::par_for_inner( DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { const Real off_diag = Ax[idx] - diag[idx] * xo[idx]; From 2327afeb940fca999c43a10d03c37bb8180626bb Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Mon, 11 Mar 2024 19:01:06 -0600 Subject: [PATCH 36/39] format --- src/solvers/mg_solver.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 216bf6f9fccb..65bd82622bc1 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -238,8 +238,7 @@ class MGSolver { Real *xo = &pack(b, te, xold_t(c), k, jb.s, ib.s); Real *xn = &pack(b, te, xnew_t(c), k, jb.s, ib.s); const int npoints = - (jb.e - jb.s + 1) * (ib.e - ib.s + 1 + 2 * nghost) - - 2 * nghost; + (jb.e - jb.s + 1) * (ib.e - ib.s + 1 + 2 * nghost) - 2 * nghost; parthenon::par_for_inner( DEFAULT_INNER_LOOP_PATTERN, 
member, 0, npoints - 1, [&](const int idx) { const Real off_diag = Ax[idx] - diag[idx] * xo[idx]; From e74bc12d5384e5fee65b0e0e48756ff4fb187e0a Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 12 Mar 2024 18:16:23 -0600 Subject: [PATCH 37/39] bugfix --- src/solvers/bicgstab_solver.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index 446609908ea2..a04d06635ee9 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -276,7 +276,7 @@ class BiCGSTABSolver { }, this, pmesh, params_.max_iters, ptol); - return tl.AddTask(check, CopyData, md); + return tl.AddTask(solver_id, CopyData, md); } Real GetSquaredResidualSum() const { return residual.val; } From d419c1ed2fb3bb0bc743c0ddd4de31eeaf575364 Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Thu, 14 Mar 2024 13:21:03 -0600 Subject: [PATCH 38/39] make solver work as inner iterative task list --- src/solvers/bicgstab_solver.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/solvers/bicgstab_solver.hpp b/src/solvers/bicgstab_solver.hpp index a04d06635ee9..2cc6bf7f7033 100644 --- a/src/solvers/bicgstab_solver.hpp +++ b/src/solvers/bicgstab_solver.hpp @@ -119,6 +119,7 @@ class BiCGSTABSolver { solver->ts.val = 0.0; solver->tt.val = 0.0; solver->residual.val = 0.0; + solver->iter_counter = 0; return TaskStatus::complete; }, this); From dddd0a2872ef344e880fe220ce1fcf535d01280c Mon Sep 17 00:00:00 2001 From: Luke Roberts Date: Tue, 19 Mar 2024 14:34:29 -0600 Subject: [PATCH 39/39] Respond to Philipp's comments --- example/poisson_gmg/main.cpp | 2 ++ src/solvers/mg_solver.hpp | 10 ++++++---- src/solvers/solver_utils.hpp | 11 +++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/example/poisson_gmg/main.cpp b/example/poisson_gmg/main.cpp index b237393b1b8f..87e53aea72e1 100644 --- a/example/poisson_gmg/main.cpp +++ b/example/poisson_gmg/main.cpp @@ -51,6 +51,8 @@ int main(int argc, char *argv[]) { if (driver_status != parthenon::DriverStatus::complete || driver.final_rms_residual > 1.e-10 || driver.final_rms_error > 1.e-12) success = false; + if (driver.final_rms_residual != driver.final_rms_residual) success = false; + if (driver.final_rms_error != driver.final_rms_error) success = false; } // call MPI_Finalize and Kokkos::finalize if necessary pman.ParthenonFinalize(); diff --git a/src/solvers/mg_solver.hpp b/src/solvers/mg_solver.hpp index 65bd82622bc1..153220a4a4cf 100644 --- a/src/solvers/mg_solver.hpp +++ b/src/solvers/mg_solver.hpp @@ -111,7 +111,8 @@ class MGSolver { &iter_counter); auto mg_finest = AddLinearOperatorTasks(itl, none, partition, pmesh); auto &md = pmesh->mesh_data.GetOrAdd("base", partition); - auto comm = AddBoundaryExchangeTasks(mg_finest, itl, md, true); + auto comm = AddBoundaryExchangeTasks(mg_finest, itl, md, + pmesh->multilevel); auto calc_pointwise_res = eqs_.template Ax(itl, comm, md); calc_pointwise_res = itl.AddTask( calc_pointwise_res, AddFieldsAndStoreInteriorSelect, md, @@ -152,7 +153,7 @@ class MGSolver { TaskID AddSetupTasks(TL_t &tl, TaskID dependence, int partition, Mesh *pmesh) { using namespace utils; - int min_level = 0; + int min_level = std::max(pmesh->GetGMGMaxLevel() - params_.max_coarsenings, 0); int max_level = pmesh->GetGMGMaxLevel(); return AddMultiGridSetupPartitionLevel(tl, dependence, partition, max_level, @@ -237,8 +238,9 @@ class MGSolver { Real *prhs = &pack(b, te, rhs_t(c), k, jb.s, ib.s); Real *xo = &pack(b, te, xold_t(c), k, jb.s, ib.s); Real 
*xn = &pack(b, te, xnew_t(c), k, jb.s, ib.s); - const int npoints = - (jb.e - jb.s + 1) * (ib.e - ib.s + 1 + 2 * nghost) - 2 * nghost; + // Use ptr arithmetic to get the number of points we need to go over + // (including ghost zones) to get from (k, jb.s, ib.s) to (k, jb.e, ib.e) + const int npoints = &pack(b, te, Axold_t(c), k, jb.e, ib.e) - Ax + 1; parthenon::par_for_inner( DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, [&](const int idx) { const Real off_diag = Ax[idx] - diag[idx] * xo[idx]; diff --git a/src/solvers/solver_utils.hpp b/src/solvers/solver_utils.hpp index 1b170edce812..871462d11f31 100644 --- a/src/solvers/solver_utils.hpp +++ b/src/solvers/solver_utils.hpp @@ -160,16 +160,18 @@ TaskStatus CopyData(const std::shared_ptr> &md) { auto pack = desc.GetPack(md.get(), only_fine_on_composite); const int scratch_size = 0; const int scratch_level = 0; + // Warning: This inner loop strategy only works because we are using IndexDomain::entire + const int npoints_inner = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); parthenon::par_for_outer( DEFAULT_OUTER_LOOP_PATTERN, "CopyData", DevExecSpace(), scratch_size, scratch_level, 0, pack.GetNBlocks() - 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { const int nvars = pack.GetUpperBound(b, in_t()) - pack.GetLowerBound(b, in_t()) + 1; - const int npoints = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); for (int c = 0; c < nvars; ++c) { Real *in = &pack(b, te, in_t(c), kb.s, jb.s, ib.s); Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); - parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, + parthenon::par_for_inner(DEFAULT_INNER_LOOP_PATTERN, member, 0, + npoints_inner - 1, [&](const int idx) { out[idx] = in[idx]; }); } }); @@ -198,18 +200,19 @@ TaskStatus AddFieldsAndStoreInteriorSelect(const std::shared_ptr> auto pack = desc.GetPack(md.get(), include_block, only_fine_on_composite); const int scratch_size = 0; const int scratch_level = 0; + // Warning: This inner loop strategy only works because we are using IndexDomain::entire + const int npoints_inner = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); parthenon::par_for_outer( DEFAULT_OUTER_LOOP_PATTERN, "AddFieldsAndStore", DevExecSpace(), scratch_size, scratch_level, 0, pack.GetNBlocks() - 1, KOKKOS_LAMBDA(parthenon::team_mbr_t member, const int b) { const int nvars = pack.GetUpperBound(b, a_t()) - pack.GetLowerBound(b, a_t()) + 1; - const int npoints = (kb.e - kb.s + 1) * (jb.e - jb.s + 1) * (ib.e - ib.s + 1); for (int c = 0; c < nvars; ++c) { Real *avar = &pack(b, te, a_t(c), kb.s, jb.s, ib.s); Real *bvar = &pack(b, te, b_t(c), kb.s, jb.s, ib.s); Real *out = &pack(b, te, out_t(c), kb.s, jb.s, ib.s); parthenon::par_for_inner( - DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints - 1, + DEFAULT_INNER_LOOP_PATTERN, member, 0, npoints_inner - 1, [&](const int idx) { out[idx] = wa * avar[idx] + wb * bvar[idx]; }); } });