From 6e2b831245f6fdac9a714c64417bd5ab21fb613d Mon Sep 17 00:00:00 2001
From: Edward Basso <edwardebasso@gmail.com>
Date: Mon, 13 Nov 2023 15:00:36 -0800
Subject: [PATCH 1/4] solve_bicgstab: use linop.make instead of MF constructor
 (#3619)

## Summary

This PR replaces the explicit use of MF constructors in
```MLCGSolverT<MF>::solve_bicgstab``` with calls to the `make` method of
the linear operator associated with the MLCGSolverT object.

## Additional background

The use of `MLLinOpT<MF>::make` allows for inheritance of MLCGSolverT
without an override of `solve_bicgstab` even if the MF class lacks a
constructor with the same arguments as those MultiFab. For the MLMG
template classes, `make` should generally be used instead of explicit MF
constructors. Another PR to change this in `solve_cg` will follow once
this is fully vetted and approved.
---
 Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
index 3764fa38f8a..fce7b1d5005 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
@@ -90,22 +90,18 @@ MLCGSolverT<MF>::solve_bicgstab (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs)
 
     const int ncomp = sol.nComp();
 
-    const BoxArray& ba = sol.boxArray();
-    const DistributionMapping& dm = sol.DistributionMap();
-    const auto& factory = sol.Factory();
-
-    MF ph(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory);
-    MF sh(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory);
+    MF ph = Lp.make(amrlev, mglev, sol.nGrowVect());
+    MF sh = Lp.make(amrlev, mglev, sol.nGrowVect());
     ph.setVal(RT(0.0));
     sh.setVal(RT(0.0));
 
-    MF sorig(ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF p    (ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF r    (ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF s    (ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF rh   (ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF v    (ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF t    (ba, dm, ncomp, nghost, MFInfo(), factory);
+    MF sorig = Lp.make(amrlev, mglev, nghost);
+    MF p     = Lp.make(amrlev, mglev, nghost);
+    MF r     = Lp.make(amrlev, mglev, nghost);
+    MF s     = Lp.make(amrlev, mglev, nghost);
+    MF rh    = Lp.make(amrlev, mglev, nghost);
+    MF v     = Lp.make(amrlev, mglev, nghost);
+    MF t     = Lp.make(amrlev, mglev, nghost);
 
     Lp.correctionResidual(amrlev, mglev, r, sol, rhs, MLLinOpT<MF>::BCMode::Homogeneous);
 

From fa3743fd1fdd5e3f1b12d431b0f6bb4b15bb7b95 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Mon, 13 Nov 2023 15:00:56 -0800
Subject: [PATCH 2/4] CArena: shrink_in_place and operator<< (#3621)

## Summary

Implement CArena::shrink_in_place, which is used by
PODVector::shrink_to_fit. It avoids a new memory allocation and data
movement.

Add operator<< to CArena. This helps debugging.

## Additional background

Follow-up on #3426.

## Checklist

The proposed changes:
- [ ] fix a bug or incorrect behavior in AMReX
- [x] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX
users
- [ ] include documentation in the code and/or rst files, if appropriate
---
 Src/Base/AMReX_CArena.H   | 19 ++++----
 Src/Base/AMReX_CArena.cpp | 96 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 105 insertions(+), 10 deletions(-)

diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H
index d68285bc878..163039df2ef 100644
--- a/Src/Base/AMReX_CArena.H
+++ b/Src/Base/AMReX_CArena.H
@@ -5,13 +5,14 @@
 #include <AMReX_Arena.H>
 
 #include <cstddef>
-#include <set>
-#include <vector>
+#include <functional>
+#include <iosfwd>
 #include <map>
 #include <mutex>
-#include <unordered_set>
-#include <functional>
+#include <set>
 #include <string>
+#include <unordered_set>
+#include <vector>
 
 namespace amrex {
 
@@ -57,7 +58,7 @@ public:
      * Try to shrink in-place
      */
     [[nodiscard]] void*
-    shrink_in_place (void* pt, std::size_t sz) final;
+    shrink_in_place (void* pt, std::size_t new_size) final;
 
     /**
     * \brief Free up allocated memory.  Merge neighboring free memory chunks
@@ -164,15 +165,15 @@ protected:
         MemStat* m_stat;
     };
 
+    //! The list of blocks allocated via ::operator new().
+    std::vector<std::pair<void*,std::size_t> > m_alloc;
+
     /**
     * \brief The type of our freelist and blocklist.
     * We use a set sorted from lo to hi memory addresses.
     */
     using NL = std::set<Node>;
 
-    //! The list of blocks allocated via ::operator new().
-    std::vector<std::pair<void*,std::size_t> > m_alloc;
-
     /**
     * \brief The free list of allocated but not currently used blocks.
     * Maintained in lo to hi memory sorted order.
@@ -198,6 +199,8 @@ protected:
 
 
     std::mutex carena_mutex;
+
+    friend std::ostream& operator<< (std::ostream& os, const CArena& arena);
 };
 
 }
diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp
index 6f7979d4750..c47f8f5ed26 100644
--- a/Src/Base/AMReX_CArena.cpp
+++ b/Src/Base/AMReX_CArena.cpp
@@ -14,6 +14,7 @@ namespace amrex {
 
 #include <utility>
 #include <cstring>
+#include <iostream>
 
 namespace amrex {
 
@@ -203,9 +204,61 @@ CArena::alloc_in_place (void* pt, std::size_t szmin, std::size_t szmax)
 }
 
 void*
-CArena::shrink_in_place (void* /*pt*/, std::size_t sz)
+CArena::shrink_in_place (void* pt, std::size_t new_size)
 {
-    return alloc(sz); // xxxxx TODO
+    if ((pt == nullptr) || (new_size == 0)) { return nullptr; }
+
+    new_size = Arena::align(new_size);
+
+    std::lock_guard<std::mutex> lock(carena_mutex);
+
+    auto busy_it = m_busylist.find(Node(pt,nullptr,0));
+    if (busy_it == m_busylist.end()) {
+        amrex::Abort("CArena::shrink_in_place: unknown pointer");
+        return nullptr;
+    }
+    AMREX_ASSERT(m_freelist.find(*busy_it) == m_freelist.end());
+
+    auto const old_size = busy_it->size();
+
+    if (new_size > old_size) {
+        amrex::Abort("CArena::shrink_in_place: wrong size. Cannot shrink to a larger size.");
+        return nullptr;
+    } else if (new_size == old_size) {
+        return pt;
+    } else {
+        auto const leftover_size = old_size - new_size;
+
+        void* pt2 = static_cast<char*>(pt) + new_size;
+        Node new_free_node(pt2, busy_it->owner(), leftover_size);
+
+        void* pt_end = static_cast<char*>(pt) + old_size;
+        auto free_it = m_freelist.find(Node(pt_end,nullptr,0));
+        if ((free_it == m_freelist.end()) || ! new_free_node.coalescable(*free_it)) {
+            m_freelist.insert(free_it, new_free_node);
+        } else {
+            auto& node = const_cast<Node&>(*free_it);
+            // This is safe because the free list is std::set and the
+            // modification of `block` does not change the order of elements
+            // in the container, even though Node's operator< uses block.
+            node.block(pt2);
+            node.size(leftover_size + node.size());
+        }
+
+        const_cast<Node&>(*busy_it).size(new_size);
+
+        m_actually_used -= leftover_size;
+
+#ifdef AMREX_TINY_PROFILING
+        if (m_do_profiling) {
+            TinyProfiler::memory_free(old_size, busy_it->mem_stat());
+            auto* stat = TinyProfiler::memory_alloc(new_size, m_profiling_stats);
+            const_cast<Node&>(*busy_it).mem_stat(stat);
+        }
+#endif
+
+        return pt;
+    }
 }
 
 void
@@ -439,4 +492,43 @@ CArena::PrintUsage (std::ostream& os, std::string const& name, std::string const
        << m_busylist.size() << " busy blocks, " << m_freelist.size() << " free blocks\n";
 }
 
+std::ostream& operator<< (std::ostream& os, const CArena& arena)
+{
+    os << "CArea:\n"
+       << "    Hunk size: " << arena.m_hunk << "\n"
+       << "    Memory allocated: " << arena.m_used << "\n"
+       << "    Memory actually used: " << arena.m_actually_used << "\n";
+
+    if (arena.m_alloc.empty()) {
+        os << "    No memory allocations\n";
+    } else {
+        os << "    List of memory alloations: (address, size)\n";
+        for (auto const& a : arena.m_alloc) {
+            os << "        " << a.first << ", " << a.second << "\n";
+        }
+    }
+
+    if (arena.m_freelist.empty()) {
+        os << "    No free nodes\n";
+    } else {
+        os << "    List of free nodes: (address, owner, size)\n";
+        for (auto const& a : arena.m_freelist) {
+            os << "        " << a.block() << ", " << a.owner() << ", "
+               << a.size() << "\n";
+        }
+    }
+
+    if (arena.m_busylist.empty()) {
+        os << "    No busy nodes\n";
+    } else {
+        os << "    List of busy nodes: (address, owner, size)\n";
+        for (auto const& a : arena.m_busylist) {
+            os << "        " << a.block() << ", " << a.owner() << ", "
+               << a.size() << "\n";
+        }
+    }
+
+    return os;
+}
+
 }

From b7408ea6e8feca7eab2f7cf30606d066b6699814 Mon Sep 17 00:00:00 2001
From: Edward Basso <edwardebasso@gmail.com>
Date: Mon, 13 Nov 2023 18:56:02 -0800
Subject: [PATCH 3/4] solve_cg: use linop.make instead of MF constructor
 (#3627)

## Summary

This PR replaces the explicit use of `MF` constructors in
`MLCGSolverT<MF>::solve_cg` with calls to the make method of the linear
operator associated with the `MLCGSolverT` object.

## Additional background

This is a similar to the PR on `solve_bicgstab`.
---
 Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
index fce7b1d5005..c99d7b319bd 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCGSolver.H
@@ -261,17 +261,13 @@ MLCGSolverT<MF>::solve_cg (MF& sol, const MF& rhs, RT eps_rel, RT eps_abs)
 
     const int ncomp = sol.nComp();
 
-    const BoxArray& ba = sol.boxArray();
-    const DistributionMapping& dm = sol.DistributionMap();
-    const auto& factory = sol.Factory();
-
-    MF p(ba, dm, ncomp, sol.nGrowVect(), MFInfo(), factory);
+    MF p = Lp.make(amrlev, mglev, sol.nGrowVect());
     p.setVal(RT(0.0));
 
-    MF sorig(ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF r    (ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF z    (ba, dm, ncomp, nghost, MFInfo(), factory);
-    MF q    (ba, dm, ncomp, nghost, MFInfo(), factory);
+    MF sorig = Lp.make(amrlev, mglev, nghost);
+    MF r     = Lp.make(amrlev, mglev, nghost);
+    MF z     = Lp.make(amrlev, mglev, nghost);
+    MF q     = Lp.make(amrlev, mglev, nghost);
 
     sorig.LocalCopy(sol,0,0,ncomp,nghost);
 

From af1e1be8d7d41de9b999b4973b2024281a28f23e Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <WeiqunZhang@lbl.gov>
Date: Tue, 14 Nov 2023 11:13:47 -0800
Subject: [PATCH 4/4] Plotfile Tools: GPU support (#3626)

## Summary

Add GPU support for fcompare, fextrema, fsnapshot and fvolumesum.

## Checklist

The proposed changes:
- [x] fix a bug or incorrect behavior in AMReX
- [x] add new capabilities to AMReX
- [ ] changes answers in the test suite to more than roundoff level
- [ ] are likely to significantly affect the results of downstream AMReX
users
- [ ] include documentation in the code and/or rst files, if appropriate
---
 Src/Base/AMReX_PlotFileDataImpl.cpp |   2 +-
 Tools/Plotfile/fextrema.cpp         |  35 ++++----
 Tools/Plotfile/fsnapshot.cpp        |   4 +-
 Tools/Plotfile/fvolumesum.cpp       | 130 ++++++++++------------------
 4 files changed, 66 insertions(+), 105 deletions(-)

diff --git a/Src/Base/AMReX_PlotFileDataImpl.cpp b/Src/Base/AMReX_PlotFileDataImpl.cpp
index 1fbf5044a50..b85c17ad93c 100644
--- a/Src/Base/AMReX_PlotFileDataImpl.cpp
+++ b/Src/Base/AMReX_PlotFileDataImpl.cpp
@@ -141,7 +141,7 @@ PlotFileDataImpl::get (int level, std::string const& varname) noexcept
             int gid = mfi.index();
             FArrayBox& dstfab = mf[mfi];
             std::unique_ptr<FArrayBox> srcfab(m_vismf[level]->readFAB(gid, icomp));
-            dstfab.copy<RunOn::Host>(*srcfab);
+            dstfab.copy<RunOn::Device>(*srcfab);
         }
     }
     return mf;
diff --git a/Tools/Plotfile/fextrema.cpp b/Tools/Plotfile/fextrema.cpp
index 44cfaf161d4..55596bf952d 100644
--- a/Tools/Plotfile/fextrema.cpp
+++ b/Tools/Plotfile/fextrema.cpp
@@ -1,5 +1,6 @@
 #include <AMReX.H>
 #include <AMReX_Print.H>
+#include <AMReX_ParReduce.H>
 #include <AMReX_PlotFileUtil.H>
 #include <AMReX_MultiFabUtil.H>
 #include <algorithm>
@@ -80,23 +81,23 @@ void main_main()
                                               pf.boxArray(ilev+1), ratio);
                 for (int ivar = 0; ivar < var_names.size(); ++ivar) {
                     const MultiFab& mf = pf.get(ilev, var_names[ivar]);
-                    for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
-                        const Box& bx = mfi.validbox();
-                        const auto lo = amrex::lbound(bx);
-                        const auto hi = amrex::ubound(bx);
-                        const auto& ifab = mask.array(mfi);
-                        const auto& fab = mf.array(mfi);
-                        for         (int k = lo.z; k <= hi.z; ++k) {
-                            for     (int j = lo.y; j <= hi.y; ++j) {
-                                for (int i = lo.x; i <= hi.x; ++i) {
-                                    if (ifab(i,j,k) == 0) {
-                                        vvmin[ivar] = std::min(fab(i,j,k),vvmin[ivar]);
-                                        vvmax[ivar] = std::max(fab(i,j,k),vvmax[ivar]);
-                                    }
-                                }
-                            }
-                        }
-                    }
+                    auto const& ma = mf.const_arrays();
+                    auto const& ima = mask.const_arrays();
+                    auto rr = ParReduce(TypeList<ReduceOpMin,ReduceOpMax>{},
+                                        TypeList<Real,Real>{}, mf,
+                              [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k)
+                                  -> GpuTuple<Real,Real>
+                              {
+                                  if (ima[bno](i,j,k) == 0) {
+                                      auto x = ma[bno](i,j,k);
+                                      return {x,x};
+                                  } else {
+                                      return {std::numeric_limits<Real>::max(),
+                                              std::numeric_limits<Real>::lowest()};
+                                  }
+                              });
+                    vvmin[ivar] = std::min(amrex::get<0>(rr), vvmin[ivar]);
+                    vvmax[ivar] = std::max(amrex::get<1>(rr), vvmax[ivar]);
                 }
             }
         }
diff --git a/Tools/Plotfile/fsnapshot.cpp b/Tools/Plotfile/fsnapshot.cpp
index e68f8a33b6d..c4b9fd3f361 100644
--- a/Tools/Plotfile/fsnapshot.cpp
+++ b/Tools/Plotfile/fsnapshot.cpp
@@ -278,7 +278,7 @@ void main_main()
         gmx = std::log10(gmx);
     }
 
-    BaseFab<unsigned char> intdat;
+    BaseFab<unsigned char> intdat(The_Pinned_Arena());
     for (int idir = ndir_begin; idir < ndir_end; ++idir) {
         intdat.resize(finebox[idir],1);
         const int width = (idir == 0) ? finebox[idir].length(1) : finebox[idir].length(0);
@@ -286,7 +286,7 @@ void main_main()
         const auto& intarr = intdat.array();
         const auto& realarr = datamf[idir].array(0);
         Real fac = Real(253.999) / (gmx-gmn);
-        amrex::LoopOnCpu(finebox[idir], [=] (int i, int j, int k)
+        amrex::ParallelFor(finebox[idir], [=] AMREX_GPU_DEVICE (int i, int j, int k)
         {
             int jj = (idir == 2) ? height - 1 - j : j;  // flip the data in second image direction
             int kk = (idir == 2) ? k : height - 1 - k;
diff --git a/Tools/Plotfile/fvolumesum.cpp b/Tools/Plotfile/fvolumesum.cpp
index 2f9f03cc522..ec6e461dcc7 100644
--- a/Tools/Plotfile/fvolumesum.cpp
+++ b/Tools/Plotfile/fvolumesum.cpp
@@ -2,6 +2,7 @@
 #include <AMReX_Print.H>
 #include <AMReX_PlotFileUtil.H>
 #include <AMReX_MultiFabUtil.H>
+#include <AMReX_ParReduce.H>
 #include <AMReX_ParallelDescriptor.H>
 #include <limits>
 #include <iterator>
@@ -80,7 +81,6 @@ void main_main()
 
     const int dim = pf.spaceDim();
 
-
     int fine_level = pf.finestLevel();
 
     Vector<Real> pos;
@@ -95,6 +95,35 @@ void main_main()
 
         Array<Real,AMREX_SPACEDIM> dx = pf.cellSize(ilev);
 
+        Real volfac = AMREX_D_TERM(dx[0], *dx[1], *dx[2]);
+
+        if (coord == 1) {
+            AMREX_ALWAYS_ASSERT(AMREX_SPACEDIM == 2);
+            // axisymmetric V = pi (r_r**2 - r_l**2) * dz
+            //                = pi dr * dz * (r_r + r_l)
+            //                = 2 pi r dr dz
+            volfac *= 2 * pi; // 2 * pi * dr * dz part here
+        } else if (coord == 2) {
+            AMREX_ALWAYS_ASSERT(AMREX_SPACEDIM == 1);
+            // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3)
+            volfac *= (4.0_rt/3.0_rt) * pi; // 4/3 * pi * dr part here
+        }
+
+        auto xlo = problo[0];
+        auto dx0 = dx[0];
+        AMREX_ASSERT(coord == 0 || coord == 1 || coord == 2);
+        auto f_vol = [=] AMREX_GPU_DEVICE (int i) {
+                         if (coord == 0) {
+                             return volfac;
+                         } else if (coord == 1) {
+                             return volfac * (xlo + (Real(i)+0.5_rt)*dx0);
+                         } else {
+                             Real r_r = xlo + Real(i+1)*dx0;
+                             Real r_l = xlo + Real(i  )*dx0;
+                             return volfac * (r_r*r_r + r_l*r_r + r_l*r_l);
+                         }
+                     };
+
         if (ilev < fine_level) {
             IntVect ratio{pf.refRatio(ilev)};
             for (int idim = dim; idim < AMREX_SPACEDIM; ++idim) {
@@ -103,97 +132,28 @@ void main_main()
             const iMultiFab mask = makeFineMask(pf.boxArray(ilev), pf.DistributionMap(ilev),
                                                 pf.boxArray(ilev+1), ratio);
             const MultiFab& mf = pf.get(ilev, var_name);
-            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
-                const Box& bx = mfi.validbox();
-                if (bx.ok()) {
-                    const auto& m = mask.array(mfi);
-                    const auto& fab = mf.array(mfi);
-                    const auto lo = amrex::lbound(bx);
-                    const auto hi = amrex::ubound(bx);
-                    for (int k = lo.z; k <= hi.z; ++k) {
-                        for (int j = lo.y; j <= hi.y; ++j) {
-                            for (int i = lo.x; i <= hi.x; ++i) {
-                                if (m(i,j,k) == 0) { // not covered by fine
-                                    Array<Real,AMREX_SPACEDIM> p
-                                        = {AMREX_D_DECL(problo[0]+static_cast<Real>(i+0.5)*dx[0],
-                                                        problo[1]+static_cast<Real>(j+0.5)*dx[1],
-                                                        problo[2]+static_cast<Real>(k+0.5)*dx[2])};
-
-                                    // compute the volume
-                                    Real vol = std::numeric_limits<Real>::quiet_NaN();
-                                    if (coord == 0) {
-                                        // Cartesian
-                                        vol = 1.0_rt;
-                                        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-                                            vol *= dx[idim];
-                                        }
-                                    } else if (coord == 1) {
-                                        // axisymmetric V = pi (r_r**2 - r_l**2) * dz
-                                        //                = pi dr * dz * (r_r + r_l)
-                                        //                = 2 pi r dr dz
-                                        vol = 2 * pi * p[0] * dx[0] * dx[1];
-                                    } else if (coord == 2) {
-                                        // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3)
-                                        Real r_r = problo[0]+static_cast<Real>(i+1)*dx[0];
-                                        Real r_l = problo[0]+static_cast<Real>(i)*dx[0];
-                                        vol = (4.0_rt/3.0_rt) * pi * dx[0] * (r_r*r_r + r_l*r_r + r_l*r_l);
-                                    }
-
-                                    lsum += fab(i,j,k) * vol;
-                                }
-                            }
-                        }
-                    }
-                }
-            }
+            auto const& ima = mask.const_arrays();
+            auto const& ma = mf.const_arrays();
+            lsum += ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, mf,
+                    [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k)
+                        -> GpuTuple<Real>
+                    {
+                        return { (ima[bno](i,j,k) == 0) ? ma[bno](i,j,k)*f_vol(i) : 0._rt };
+                    });
         } else {
             const MultiFab& mf = pf.get(ilev, var_name);
-            for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
-                const Box& bx = mfi.validbox();
-                if (bx.ok()) {
-                    const auto& fab = mf.array(mfi);
-                    const auto lo = amrex::lbound(bx);
-                    const auto hi = amrex::ubound(bx);
-                    for (int k = lo.z; k <= hi.z; ++k) {
-                        for (int j = lo.y; j <= hi.y; ++j) {
-                            for (int i = lo.x; i <= hi.x; ++i) {
-                                Array<Real,AMREX_SPACEDIM> p
-                                    = {AMREX_D_DECL(problo[0]+static_cast<Real>(i+0.5)*dx[0],
-                                                    problo[1]+static_cast<Real>(j+0.5)*dx[1],
-                                                    problo[2]+static_cast<Real>(k+0.5)*dx[2])};
-
-                                    // compute the volume
-                                    Real vol = std::numeric_limits<Real>::quiet_NaN();
-                                    if (coord == 0) {
-                                        // Cartesian
-                                        vol = 1.0_rt;
-                                        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
-                                            vol *= dx[idim];
-                                        }
-                                    } else if (coord == 1) {
-                                        // axisymmetric V = pi (r_r**2 - r_l**2) * dz
-                                        //                = pi dr * dz * (r_r + r_l)
-                                        //                = 2 pi r dr dz
-                                        vol = 2 * pi * p[0] * dx[0] * dx[1];
-                                    } else if (coord == 2) {
-                                        // 1-d spherical V = 4/3 pi (r_r**3 - r_l**3)
-                                        Real r_r = problo[0]+static_cast<Real>(i+1)*dx[0];
-                                        Real r_l = problo[0]+static_cast<Real>(i)*dx[0];
-                                        vol = (4.0_rt/3.0_rt) * pi * dx[0] * (r_r*r_r + r_l*r_r + r_l*r_l);
-                                    }
-
-                                    lsum += fab(i,j,k) * vol;
-                            }
-                        }
-                    }
-                }
-            }
+            auto const& ma = mf.const_arrays();
+            lsum += ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, mf,
+                    [=] AMREX_GPU_DEVICE (int bno, int i, int j, int k)
+                        -> GpuTuple<Real>
+                    {
+                        return { ma[bno](i,j,k)*f_vol(i) };
+                    });
         }
     }
 
     ParallelDescriptor::ReduceRealSum(lsum);
 
-
     if (ParallelDescriptor::IOProcessor()) {
         std::cout << "integral of " << var_name << " = " << lsum << std::endl;