From 1be7257928eb5b553905a482e3a3d608272d22f0 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Fri, 29 Mar 2024 10:03:10 -0700
Subject: [PATCH] FabArray: Option to use a single contiguous chunk of memory
 (#3857)

This adds an option to use a single contiguous chunk of memory for all the
data in the Fabs of a FabArray/MultiFab/iMultiFab. One can change the
strategy for an individual MultiFab via MFInfo::SetAllocSingleChunk(bool),
and for all MultiFabs by default via the ParmParse parameter
amrex.mf.alloc_single_chunk=1.

This is considered an experimental feature. Please let us know if you notice
any issues.
---
 Docs/sphinx_documentation/source/Basics.rst | 24 +++++++--
 Src/Base/AMReX_FArrayBox.cpp                |  2 +-
 Src/Base/AMReX_FabArray.H                   | 56 +++++++++++++++++++--
 Src/Base/AMReX_FabArrayBase.H               | 39 ++++++++++++++
 Src/Base/AMReX_FabArrayBase.cpp             | 53 +++++++++++++++++++
 Src/Base/AMReX_FabFactory.H                 | 11 +++-
 Src/EB/AMReX_MultiCutFab.H                  |  2 -
 Src/EB/AMReX_MultiCutFab.cpp                | 56 +++++++++++++++------
 8 files changed, 215 insertions(+), 28 deletions(-)

diff --git a/Docs/sphinx_documentation/source/Basics.rst b/Docs/sphinx_documentation/source/Basics.rst
index 8c5fe3dbba5..406d4981c8b 100644
--- a/Docs/sphinx_documentation/source/Basics.rst
+++ b/Docs/sphinx_documentation/source/Basics.rst
@@ -2735,10 +2735,26 @@ covered by fine level grids.
 Memory Allocation
 =================
 
-Some constructors of :cpp:`MultiFab`, :cpp:`FArrayBox`, etc. can take
-an :cpp:`Arena` argument for memory allocation. This is usually not
-important for CPU codes, but very important for GPU codes. We will
-present more details in :ref:`sec:gpu:memory` in Chapter GPU.
+Some constructors of :cpp:`MultiFab`, :cpp:`FArrayBox`, etc. can take an
+:cpp:`Arena` argument for memory allocation. Some constructors of
+:cpp:`MultiFab` can take an optional :cpp:`MFInfo` argument, which can be
+used to set the arena. This is usually not important for CPU codes, but
+very important for GPU codes. We will present more details about memory
+arenas in :ref:`sec:gpu:memory` in Chapter GPU.
+
+Every :cpp:`FArrayBox` in a :cpp:`MultiFab` has a contiguous chunk of memory
+for its floating point data, whereas by default a :cpp:`MultiFab`, as a
+collection of multiple :cpp:`FArrayBox`\ s, does not store all floating point
+data in a single contiguous chunk of memory. This behavior can be changed for
+all :cpp:`MultiFab`\ s with the :cpp:`ParmParse` parameter
+``amrex.mf.alloc_single_chunk=1``, or for a specific :cpp:`MultiFab` by
+passing a :cpp:`MFInfo` object (e.g.,
+``MFInfo().SetAllocSingleChunk(true)``) to the constructor. One can call
+:cpp:`MultiFab::singleChunkPtr()` to obtain a pointer to the single-chunk
+memory. Note that the function returns a null pointer if the :cpp:`MultiFab`
+does not use a single contiguous chunk of memory. One can also call
+:cpp:`MultiFab::singleChunkSize()` to obtain the size in bytes of the
+single-chunk memory.
 
 AMReX has a Fortran module, :fortran:`amrex_mempool_module` that can be used to
 allocate memory for Fortran pointers. The reason that such a module exists in
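For illustration, a minimal sketch of how the new option might be used from
application code (the function name and the assumption that a BoxArray `ba`
and DistributionMapping `dm` already exist are illustrative; this snippet is
not part of the patch):

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <cstddef>

    void single_chunk_example (amrex::BoxArray const& ba,
                               amrex::DistributionMapping const& dm)
    {
        using namespace amrex;

        // Request one contiguous allocation for all FArrayBoxes of this
        // MultiFab.  Alternatively, amrex.mf.alloc_single_chunk=1 in the
        // inputs file (or on the command line) makes this the default.
        MultiFab mf(ba, dm, 3, 1, MFInfo().SetAllocSingleChunk(true));

        Real*       p      = mf.singleChunkPtr();   // nullptr if not single-chunk
        std::size_t nbytes = mf.singleChunkSize();  // 0 if not single-chunk
        ignore_unused(p, nbytes);
    }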
diff --git a/Src/Base/AMReX_FArrayBox.cpp b/Src/Base/AMReX_FArrayBox.cpp
index 6efd90f97e9..ecb7fc0f4fc 100644
--- a/Src/Base/AMReX_FArrayBox.cpp
+++ b/Src/Base/AMReX_FArrayBox.cpp
@@ -809,7 +809,7 @@ FABio_8bit::write (std::ostream& os,
         const Real mn = f.min(k+comp);
         const Real mx = f.max(k+comp);
         const Real* dat = f.dataPtr(k+comp);
-        Real rng = std::fabs(mx-mn);
+        Real rng = std::abs(mx-mn);
         rng = (rng < eps) ? 0.0_rt : 255.0_rt/(mx-mn);
         for(Long i(0); i < siz; ++i) {
             Real v = rng*(dat[i]-mn);
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
index 38bdd212a4b..426021d6272 100644
--- a/Src/Base/AMReX_FabArray.H
+++ b/Src/Base/AMReX_FabArray.H
@@ -65,11 +65,14 @@ Long nBytesOwned (BaseFab<T> const& fab) noexcept { return fab.nBytesOwned(); }
 struct MFInfo {
     // alloc: allocate memory or not
     bool alloc = true;
+    bool alloc_single_chunk = FabArrayBase::getAllocSingleChunk();
     Arena* arena = nullptr;
     Vector<std::string> tags;
 
     MFInfo& SetAlloc (bool a) noexcept { alloc = a; return *this; }
 
+    MFInfo& SetAllocSingleChunk (bool a) noexcept { alloc_single_chunk = a; return *this; }
+
     MFInfo& SetArena (Arena* ar) noexcept { arena = ar; return *this; }
 
     MFInfo& SetTag () noexcept { return *this; }
@@ -436,6 +439,22 @@ public:
 #endif
     }
 
+    //! Return the data pointer to the single chunk memory if this object
+    //! uses a single contiguous chunk of memory, nullptr otherwise.
+    [[nodiscard]] value_type* singleChunkPtr () noexcept {
+        return m_single_chunk_arena ? (value_type*)m_single_chunk_arena->data() : nullptr;
+    }
+
+    //! Return the data pointer to the single chunk memory if this object
+    //! uses a single contiguous chunk of memory, nullptr otherwise.
+    [[nodiscard]] value_type const* singleChunkPtr () const noexcept {
+        return m_single_chunk_arena ? (value_type const*)m_single_chunk_arena->data() : nullptr;
+    }
+
+    //! Return the size of the single chunk memory if this object uses a
+    //! single contiguous chunk of memory, 0 otherwise.
+    [[nodiscard]] std::size_t singleChunkSize () const noexcept { return m_single_chunk_size; }
+
     bool isAllRegular () const noexcept {
 #ifdef AMREX_USE_EB
         const auto *const f = dynamic_cast(m_factory.get());
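Because the new accessors expose all of the data as one block, bulk
operations become possible when two MultiFabs were both built single-chunk
with identical layout. A hedged sketch, assuming host-accessible memory
(e.g., a CPU build) and matching BoxArray, DistributionMapping, components,
and ghost cells (the function name bulk_copy is illustrative, not part of
the patch):

    #include <AMReX_MultiFab.H>
    #include <cstring>

    void bulk_copy (amrex::MultiFab& dst, amrex::MultiFab const& src)
    {
        if (dst.singleChunkPtr() && src.singleChunkPtr() &&
            dst.singleChunkSize() == src.singleChunkSize())
        {
            // One contiguous copy instead of a Fab-by-Fab loop.
            std::memcpy(dst.singleChunkPtr(), src.singleChunkPtr(),
                        src.singleChunkSize());
        } else {
            amrex::MultiFab::Copy(dst, src, 0, 0, dst.nComp(), dst.nGrowVect());
        }
    }

On GPU builds the chunk may live in device memory, in which case a
device-aware copy would be needed instead of std::memcpy.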
@@ -1233,6 +1252,8 @@ protected:
     std::unique_ptr<FabFactory<FAB> > m_factory;
     DataAllocator m_dallocator;
+    std::unique_ptr<detail::SingleChunkArena> m_single_chunk_arena;
+    Long m_single_chunk_size = 0;
 
     //! has define() been called?
     bool define_function_called = false;
 
@@ -1306,7 +1327,8 @@ private:
     using Iterator = typename std::vector<FAB*>::iterator;
 
     void AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
-                    const Vector<std::string>& tags);
+                    const Vector<std::string>& tags,
+                    bool alloc_single_chunk);
 
     void setFab_assert (int K, FAB const& fab) const;
 
@@ -1696,6 +1718,7 @@ FabArray<FAB>::release (int K)
 {
     const int li = localindex(K);
     if (li >= 0 && li < static_cast<int>(m_fabs_v.size()) && m_fabs_v[li] != nullptr) {
+        AMREX_ASSERT(m_single_chunk_arena == nullptr);
         Long nbytes = amrex::nBytesOwned(*m_fabs_v[li]);
         if (nbytes > 0) {
             for (auto const& t : m_tags) {
@@ -1715,6 +1738,7 @@ FabArray<FAB>::release (const MFIter& mfi)
 {
     const int li = mfi.LocalIndex();
     if (li >= 0 && li < static_cast<int>(m_fabs_v.size()) && m_fabs_v[li] != nullptr) {
+        AMREX_ASSERT(m_single_chunk_arena == nullptr);
         Long nbytes = amrex::nBytesOwned(*m_fabs_v[li]);
         if (nbytes > 0) {
             for (auto const& t : m_tags) {
@@ -1755,6 +1779,12 @@ FabArray<FAB>::clear ()
             updateMemUsage(t, -nbytes, nullptr);
         }
     }
+
+    if (m_single_chunk_arena) {
+        m_single_chunk_arena.reset();
+    }
+    m_single_chunk_size = 0;
+
     m_tags.clear();
 
     FabArrayBase::clear();
@@ -1880,6 +1910,8 @@ FabArray<FAB>::FabArray (FabArray<FAB>&& rhs) noexcept
     : FabArrayBase (static_cast<FabArrayBase&&>(rhs))
     , m_factory (std::move(rhs.m_factory))
     , m_dallocator (std::move(rhs.m_dallocator))
+    , m_single_chunk_arena(std::move(rhs.m_single_chunk_arena))
+    , m_single_chunk_size(std::exchange(rhs.m_single_chunk_size,0))
     , define_function_called(rhs.define_function_called)
     , m_fabs_v (std::move(rhs.m_fabs_v))
 #ifdef AMREX_USE_GPU
@@ -1909,6 +1941,8 @@ FabArray<FAB>::operator= (FabArray<FAB>&& rhs) noexcept
         FabArrayBase::operator=(static_cast<FabArrayBase&&>(rhs));
         m_factory = std::move(rhs.m_factory);
         m_dallocator = std::move(rhs.m_dallocator);
+        m_single_chunk_arena = std::move(rhs.m_single_chunk_arena);
+        std::swap(m_single_chunk_size, rhs.m_single_chunk_size);
         define_function_called = rhs.define_function_called;
         std::swap(m_fabs_v, rhs.m_fabs_v);
 #ifdef AMREX_USE_GPU
@@ -2008,7 +2042,7 @@ FabArray<FAB>::define (const BoxArray& bxs,
     addThisBD();
 
     if(info.alloc) {
-        AllocFabs(*m_factory, m_dallocator.m_arena, info.tags);
+        AllocFabs(*m_factory, m_dallocator.m_arena, info.tags, info.alloc_single_chunk);
 #ifdef BL_USE_TEAM
         ParallelDescriptor::MyTeam().MemoryBarrier();
 #endif
@@ -2018,8 +2052,11 @@
 template <class FAB>
 void
 FabArray<FAB>::AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
-                          const Vector<std::string>& tags)
+                          const Vector<std::string>& tags, bool alloc_single_chunk)
 {
+    if (shmem.alloc) { alloc_single_chunk = false; }
+    if constexpr (!IsBaseFab_v<FAB>) { alloc_single_chunk = false; }
+
     const int n = indexArray.size();
     const int nworkers = ParallelDescriptor::TeamSize();
     shmem.alloc = (nworkers > 1);
@@ -2029,6 +2066,18 @@ FabArray<FAB>::AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
 
     FabInfo fab_info;
     fab_info.SetAlloc(alloc).SetShared(shmem.alloc).SetArena(ar);
+    if (alloc_single_chunk) {
+        m_single_chunk_size = 0L;
+        for (int i = 0; i < n; ++i) {
+            int K = indexArray[i];
+            const Box& tmpbox = fabbox(K);
+            m_single_chunk_size += factory.nBytes(tmpbox, n_comp, K);
+        }
+        AMREX_ASSERT(m_single_chunk_size >= 0); // 0 is okay.
+        m_single_chunk_arena = std::make_unique<detail::SingleChunkArena>(ar, m_single_chunk_size);
+        fab_info.SetArena(m_single_chunk_arena.get());
+    }
+
     m_fabs_v.reserve(n);
 
     Long nbytes = 0L;
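For a MultiFab built with the default factory, the chunk size accumulated
above is simply the sum of the sizes of all locally owned Fabs, including
ghost cells. That can be cross-checked from user code with a sketch like the
following (the function name check_chunk_size is illustrative, not part of
the patch):

    #include <AMReX_MultiFab.H>
    #include <cstddef>

    void check_chunk_size (amrex::MultiFab const& mf)
    {
        if (mf.singleChunkPtr() == nullptr) { return; } // not single-chunk
        std::size_t expected = 0;
        for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
            // fabbox() is the valid box grown by the number of ghost cells.
            expected += mf.fabbox(mfi.index()).numPts()
                        * mf.nComp() * sizeof(amrex::Real);
        }
        AMREX_ALWAYS_ASSERT(mf.singleChunkSize() == expected);
    }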
@@ -2136,6 +2185,7 @@ FabArray<FAB>::setFab_assert (int K, FAB const& fab) const
     AMREX_ASSERT(!boxarray.empty());
     AMREX_ASSERT(fab.box() == fabbox(K));
     AMREX_ASSERT(distributionMap[K] == ParallelDescriptor::MyProc());
+    AMREX_ASSERT(m_single_chunk_arena == nullptr);
 }
 
 template <class FAB>
diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H
index 09dfd5e22db..44d0d9c269c 100644
--- a/Src/Base/AMReX_FabArrayBase.H
+++ b/Src/Base/AMReX_FabArrayBase.H
@@ -3,6 +3,7 @@
 #include
 
 #include
+#include
 #include
 #include
 #include
@@ -721,8 +722,46 @@ public:
     };
 
     static AMREX_EXPORT FabArrayStats m_FA_stats;
+    static AMREX_EXPORT bool m_alloc_single_chunk;
+
+    [[nodiscard]] static bool getAllocSingleChunk () { return m_alloc_single_chunk; }
 };
 
+namespace detail {
+    class SingleChunkArena final
+        : public Arena
+    {
+    public:
+        SingleChunkArena (Arena* a_arena, std::size_t a_size);
+        ~SingleChunkArena () override;
+
+        SingleChunkArena () = delete;
+        SingleChunkArena (const SingleChunkArena& rhs) = delete;
+        SingleChunkArena (SingleChunkArena&& rhs) = delete;
+        SingleChunkArena& operator= (const SingleChunkArena& rhs) = delete;
+        SingleChunkArena& operator= (SingleChunkArena&& rhs) = delete;
+
+        [[nodiscard]] void* alloc (std::size_t sz) override;
+        void free (void* pt) override;
+
+        // isDeviceAccessible and isHostAccessible can both be true.
+        [[nodiscard]] bool isDeviceAccessible () const override;
+        [[nodiscard]] bool isHostAccessible () const override;
+
+        [[nodiscard]] bool isManaged () const override;
+        [[nodiscard]] bool isDevice () const override;
+        [[nodiscard]] bool isPinned () const override;
+
+        [[nodiscard]] void* data () const noexcept { return (void*) m_root; }
+
+    private:
+        DataAllocator m_dallocator;
+        char* m_root = nullptr;
+        char* m_free = nullptr;
+        std::size_t m_size = 0;
+    };
+}
+
 [[nodiscard]] int nComp (FabArrayBase const& fa);
 [[nodiscard]] IntVect nGrowVect (FabArrayBase const& fa);
 [[nodiscard]] BoxArray const& boxArray (FabArrayBase const& fa);
diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp
index eb8fc99605b..c5168fb2ceb 100644
--- a/Src/Base/AMReX_FabArrayBase.cpp
+++ b/Src/Base/AMReX_FabArrayBase.cpp
@@ -86,6 +86,8 @@ FabArrayBase::FabArrayStats FabArrayBase::m_FA_stats;
 std::map FabArrayBase::m_mem_usage;
 std::vector FabArrayBase::m_region_tag;
 
+bool FabArrayBase::m_alloc_single_chunk = false;
+
 namespace {
     bool initialized = false;
@@ -122,6 +124,9 @@
         MaxComp = 1;
     }
 
+    ParmParse ppmf("amrex.mf");
+    ppmf.queryAdd("alloc_single_chunk", FabArrayBase::m_alloc_single_chunk);
+
     amrex::ExecOnFinalize(FabArrayBase::Finalize);
 
 #ifdef AMREX_MEM_PROFILING
@@ -2696,6 +2701,54 @@ FabArrayBase::flushParForCache ()
 
 #endif
 
+namespace detail {
+
+    SingleChunkArena::SingleChunkArena (Arena* a_arena, std::size_t a_size)
+        : m_dallocator(a_arena),
+          m_root((char*)m_dallocator.alloc(a_size)),
+          m_free(m_root),
+          m_size(a_size)
+    {}
+
+    SingleChunkArena::~SingleChunkArena ()
+    {
+        if (m_root) {
+            m_dallocator.free(m_root);
+        }
+    }
+
+    void* SingleChunkArena::alloc (std::size_t sz)
+    {
+        amrex::ignore_unused(m_size);
+        auto* p = (void*)m_free;
+        AMREX_ASSERT(sz <= m_size && ((m_free-m_root)+sz <= m_size));
+        m_free += sz;
+        return p;
+    }
+
+    void SingleChunkArena::free (void* /*pt*/) {}
+
+    bool SingleChunkArena::isDeviceAccessible () const {
+        return m_dallocator.arena()->isDeviceAccessible();
+    }
+
+    bool SingleChunkArena::isHostAccessible () const {
+        return m_dallocator.arena()->isHostAccessible();
+    }
+
+    bool SingleChunkArena::isManaged () const {
+        return m_dallocator.arena()->isManaged();
+    }
+
+    bool SingleChunkArena::isDevice () const {
+        return m_dallocator.arena()->isDevice();
+    }
+
+    bool SingleChunkArena::isPinned () const {
+        return m_dallocator.arena()->isPinned();
+    }
+}
+
 int nComp (FabArrayBase const& fa)
 {
     return fa.nComp();
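Since SingleChunkArena hands out consecutive sub-ranges of one allocation,
every local Fab's data ends up inside the block returned by
singleChunkPtr(). A small sanity check along these lines (the function name
check_layout is illustrative, not part of the patch):

    #include <AMReX_MultiFab.H>

    void check_layout (amrex::MultiFab const& mf)
    {
        auto const* base = reinterpret_cast<char const*>(mf.singleChunkPtr());
        if (base == nullptr) { return; } // not allocated as a single chunk
        for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
            auto const* p = reinterpret_cast<char const*>(mf[mfi].dataPtr());
            AMREX_ALWAYS_ASSERT(p >= base && p < base + mf.singleChunkSize());
        }
    }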
diff --git a/Src/Base/AMReX_FabFactory.H b/Src/Base/AMReX_FabFactory.H
index 01ee085f32f..7f65c21ce2b 100644
--- a/Src/Base/AMReX_FabFactory.H
+++ b/Src/Base/AMReX_FabFactory.H
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 
 namespace amrex {
 
@@ -59,8 +60,14 @@ public:
     AMREX_NODISCARD
     virtual FAB* create_alias (FAB const& /*rhs*/, int /*scomp*/, int /*ncomp*/) const { return nullptr; }
     virtual void destroy (FAB* fab) const = 0;
-    AMREX_NODISCARD
-    virtual FabFactory<FAB>* clone () const = 0;
+    AMREX_NODISCARD virtual FabFactory<FAB>* clone () const = 0;
+    AMREX_NODISCARD virtual Long nBytes (const Box& box, int ncomps, int /*box_index*/) const {
+        if constexpr (IsBaseFab_v<FAB>) {
+            return box.numPts() * ncomps * Long(sizeof(typename FAB::value_type));
+        } else {
+            return -1;
+        }
+    }
 };
 
 template <class FAB>
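A custom factory can opt its Fab type into single-chunk allocation by
overriding the new nBytes() hook so that it reports exactly what create()
will allocate from info.arena. A hypothetical sketch (the class name
ExampleFabFactory is an assumption; FArrayBox is used only to keep the
snippet self-contained, and for BaseFab types the default nBytes() above
already covers it):

    #include <AMReX_FabFactory.H>
    #include <AMReX_FArrayBox.H>

    class ExampleFabFactory final : public amrex::FabFactory<amrex::FArrayBox>
    {
    public:
        AMREX_NODISCARD
        amrex::FArrayBox* create (const amrex::Box& box, int ncomps,
                                  const amrex::FabInfo& info, int /*box_index*/) const override
        {
            // Honor info.arena so the data can come from the single-chunk arena.
            return new amrex::FArrayBox(box, ncomps, info.alloc, info.shared, info.arena);
        }
        void destroy (amrex::FArrayBox* fab) const override { delete fab; }
        AMREX_NODISCARD
        ExampleFabFactory* clone () const override { return new ExampleFabFactory(*this); }
        AMREX_NODISCARD
        amrex::Long nBytes (const amrex::Box& box, int ncomps, int /*box_index*/) const override
        {
            // Must match what create() allocates from info.arena.
            return box.numPts() * ncomps * amrex::Long(sizeof(amrex::Real));
        }
    };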
diff --git a/Src/EB/AMReX_MultiCutFab.H b/Src/EB/AMReX_MultiCutFab.H
index a7042b8e129..3548cd79fe4 100644
--- a/Src/EB/AMReX_MultiCutFab.H
+++ b/Src/EB/AMReX_MultiCutFab.H
@@ -144,8 +144,6 @@ private:
 
     FabArray<CutFab> m_data;
     const FabArray<EBCellFlagFab>* m_cellflags = nullptr;
-
-    void remove ();
 };
 
 }
diff --git a/Src/EB/AMReX_MultiCutFab.cpp b/Src/EB/AMReX_MultiCutFab.cpp
index 5bcbed1cfbe..b3987cca102 100644
--- a/Src/EB/AMReX_MultiCutFab.cpp
+++ b/Src/EB/AMReX_MultiCutFab.cpp
@@ -8,14 +8,51 @@
 namespace amrex {
 
+namespace detail {
+    class CutFabFactory final
+        : public DefaultFabFactory<CutFab>
+    {
+    public:
+        CutFabFactory (const FabArray<EBCellFlagFab>* a_cellflags)
+            : m_cellflags(a_cellflags)
+        {}
+
+        AMREX_NODISCARD
+        CutFab* create (const Box& box, int ncomps, const FabInfo& info, int box_index) const override
+        {
+            if ((*m_cellflags)[box_index].getType() == FabType::singlevalued) {
+                return new CutFab(box, ncomps, info.alloc, info.shared, info.arena);
+            } else {
+                return new CutFab();
+            }
+        }
+
+        AMREX_NODISCARD
+        CutFabFactory* clone () const override { return new CutFabFactory(m_cellflags); }
+
+        AMREX_NODISCARD
+        Long nBytes (const Box& box, int ncomps, int box_index) const override
+        {
+            if ((*m_cellflags)[box_index].getType() == FabType::singlevalued) {
+                return box.numPts() * ncomps * Long(sizeof(Real));
+            } else {
+                return Long(0);
+            }
+        }
+
+    private:
+        const FabArray<EBCellFlagFab>* m_cellflags = nullptr;
+    };
+}
+
 MultiCutFab::MultiCutFab () = default;
 
 MultiCutFab::MultiCutFab (const BoxArray& ba, const DistributionMapping& dm,
                           int ncomp, int ngrow, const FabArray<EBCellFlagFab>& cellflags)
-    : m_data(ba,dm,ncomp,ngrow,MFInfo(),DefaultFabFactory<CutFab>()),
+    : m_data(ba,dm,ncomp,ngrow,MFInfo(),detail::CutFabFactory(&cellflags)),
       m_cellflags(&cellflags)
 {
-    remove();
 }
 
 MultiCutFab::~MultiCutFab () = default;
@@ -24,21 +61,8 @@ void
 MultiCutFab::define (const BoxArray& ba, const DistributionMapping& dm,
                      int ncomp, int ngrow, const FabArray<EBCellFlagFab>& cellflags)
 {
-    m_data.define(ba,dm,ncomp,ngrow,MFInfo(),DefaultFabFactory<CutFab>()),
+    m_data.define(ba,dm,ncomp,ngrow,MFInfo(),detail::CutFabFactory(&cellflags)),
     m_cellflags = &cellflags;
-    remove();
-}
-
-void
-MultiCutFab::remove ()
-{
-    for (MFIter mfi(m_data); mfi.isValid(); ++mfi)
-    {
-        if (!ok(mfi))
-        {
-            delete m_data.release(mfi);
-        }
-    }
 }
 
 const CutFab&