From 1be7257928eb5b553905a482e3a3d608272d22f0 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Fri, 29 Mar 2024 10:03:10 -0700
Subject: [PATCH] FabArray: Option to use a single contiguous chunk of memory
 (#3857)

This adds an option to use a single contiguous chunk of memory for all the
data in the Fabs of a FabArray/MultiFab/iMultiFab. One can change the
strategy for an individual MultiFab via MFInfo::SetAllocSingleChunk(bool),
and for all MultiFabs by default via the ParmParse parameter
amrex.mf.alloc_single_chunk=1.

This is considered an experimental feature. Please let us know if you notice
any issues.
---
 Docs/sphinx_documentation/source/Basics.rst | 24 +++++++--
 Src/Base/AMReX_FArrayBox.cpp                |  2 +-
 Src/Base/AMReX_FabArray.H                   | 56 +++++++++++++++++++--
 Src/Base/AMReX_FabArrayBase.H               | 39 ++++++++++++++
 Src/Base/AMReX_FabArrayBase.cpp             | 53 +++++++++++++++++++
 Src/Base/AMReX_FabFactory.H                 | 11 +++-
 Src/EB/AMReX_MultiCutFab.H                  |  2 -
 Src/EB/AMReX_MultiCutFab.cpp                | 56 +++++++++++++++------
 8 files changed, 215 insertions(+), 28 deletions(-)

diff --git a/Docs/sphinx_documentation/source/Basics.rst b/Docs/sphinx_documentation/source/Basics.rst
index 8c5fe3dbba5..406d4981c8b 100644
--- a/Docs/sphinx_documentation/source/Basics.rst
+++ b/Docs/sphinx_documentation/source/Basics.rst
@@ -2735,10 +2735,26 @@ covered by fine level grids.
 Memory Allocation
 =================
 
-Some constructors of :cpp:`MultiFab`, :cpp:`FArrayBox`, etc. can take
-an :cpp:`Arena` argument for memory allocation. This is usually not
-important for CPU codes, but very important for GPU codes. We will
-present more details in :ref:`sec:gpu:memory` in Chapter GPU.
+Some constructors of :cpp:`MultiFab`, :cpp:`FArrayBox`, etc. can take an
+:cpp:`Arena` argument for memory allocation. Some constructors of
+:cpp:`MultiFab` can take an optional :cpp:`MFInfo` argument, which can be
+used to set the arena. This is usually not important for CPU codes, but
+very important for GPU codes. We will present more details about memory
+arenas in :ref:`sec:gpu:memory` in Chapter GPU.
+
+Every :cpp:`FArrayBox` in a :cpp:`MultiFab` has a contiguous chunk of memory
+for its floating point data, whereas by default a :cpp:`MultiFab`, as a
+collection of multiple :cpp:`FArrayBox`\ s, does not store all floating point
+data in a single contiguous chunk of memory. This behavior can be changed for
+all :cpp:`MultiFab`\ s with the :cpp:`ParmParse` parameter
+``amrex.mf.alloc_single_chunk=1``, or for a specific :cpp:`MultiFab` by
+passing a :cpp:`MFInfo` object (e.g.,
+``MFInfo().SetAllocSingleChunk(true)``) to the constructor. One can call
+:cpp:`MultiFab::singleChunkPtr()` to obtain a pointer to the single-chunk
+memory. Note that the function returns a null pointer if the :cpp:`MultiFab`
+does not use a single contiguous chunk of memory. One can also call
+:cpp:`MultiFab::singleChunkSize()` to obtain the size in bytes of the
+single-chunk memory.
 
 AMReX has a Fortran module, :fortran:`amrex_mempool_module` that can be used to
 allocate memory for Fortran pointers. The reason that such a module exists in
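For illustration, a minimal sketch of how the new option might be used from
application code (the function name and the assumption that a BoxArray `ba`
and DistributionMapping `dm` already exist are illustrative; this snippet is
not part of the patch):

    #include <AMReX.H>
    #include <AMReX_MultiFab.H>
    #include <cstddef>

    void single_chunk_example (amrex::BoxArray const& ba,
                               amrex::DistributionMapping const& dm)
    {
        using namespace amrex;

        // Request one contiguous allocation for all FArrayBoxes of this
        // MultiFab.  Alternatively, amrex.mf.alloc_single_chunk=1 in the
        // inputs file (or on the command line) makes this the default.
        MultiFab mf(ba, dm, 3, 1, MFInfo().SetAllocSingleChunk(true));

        Real*       p      = mf.singleChunkPtr();   // nullptr if not single-chunk
        std::size_t nbytes = mf.singleChunkSize();  // 0 if not single-chunk
        ignore_unused(p, nbytes);
    }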
diff --git a/Src/Base/AMReX_FArrayBox.cpp b/Src/Base/AMReX_FArrayBox.cpp
index 6efd90f97e9..ecb7fc0f4fc 100644
--- a/Src/Base/AMReX_FArrayBox.cpp
+++ b/Src/Base/AMReX_FArrayBox.cpp
@@ -809,7 +809,7 @@ FABio_8bit::write (std::ostream& os,
         const Real mn = f.min(k+comp);
         const Real mx = f.max(k+comp);
         const Real* dat = f.dataPtr(k+comp);
-        Real rng = std::fabs(mx-mn);
+        Real rng = std::abs(mx-mn);
         rng = (rng < eps) ? 0.0_rt : 255.0_rt/(mx-mn);
         for(Long i(0); i < siz; ++i) {
             Real v = rng*(dat[i]-mn);
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
index 38bdd212a4b..426021d6272 100644
--- a/Src/Base/AMReX_FabArray.H
+++ b/Src/Base/AMReX_FabArray.H
@@ -65,11 +65,14 @@ Long nBytesOwned (BaseFab<T> const& fab) noexcept { return fab.nBytesOwned(); }
 struct MFInfo {
     // alloc: allocate memory or not
     bool alloc = true;
+    bool alloc_single_chunk = FabArrayBase::getAllocSingleChunk();
     Arena* arena = nullptr;
     Vector<std::string> tags;
 
     MFInfo& SetAlloc (bool a) noexcept { alloc = a; return *this; }
 
+    MFInfo& SetAllocSingleChunk (bool a) noexcept { alloc_single_chunk = a; return *this; }
+
     MFInfo& SetArena (Arena* ar) noexcept { arena = ar; return *this; }
 
     MFInfo& SetTag () noexcept { return *this; }
@@ -436,6 +439,22 @@ public:
 #endif
     }
 
+    //! Return the data pointer to the single chunk memory if this object
+    //! uses a single contiguous chunk of memory, nullptr otherwise.
+    [[nodiscard]] value_type* singleChunkPtr () noexcept {
+        return m_single_chunk_arena ? (value_type*)m_single_chunk_arena->data() : nullptr;
+    }
+
+    //! Return the data pointer to the single chunk memory if this object
+    //! uses a single contiguous chunk of memory, nullptr otherwise.
+    [[nodiscard]] value_type const* singleChunkPtr () const noexcept {
+        return m_single_chunk_arena ? (value_type const*)m_single_chunk_arena->data() : nullptr;
+    }
+
+    //! Return the size of the single chunk memory if this object uses a
+    //! single contiguous chunk of memory, 0 otherwise.
+    [[nodiscard]] std::size_t singleChunkSize () const noexcept { return m_single_chunk_size; }
+
     bool isAllRegular () const noexcept {
 #ifdef AMREX_USE_EB
         const auto *const f = dynamic_cast(m_factory.get());
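Because the new accessors expose all of the data as one block, bulk
operations become possible when two MultiFabs were both built single-chunk
with identical layout. A hedged sketch, assuming host-accessible memory
(e.g., a CPU build) and matching BoxArray, DistributionMapping, components,
and ghost cells (the function name bulk_copy is illustrative, not part of
the patch):

    #include <AMReX_MultiFab.H>
    #include <cstring>

    void bulk_copy (amrex::MultiFab& dst, amrex::MultiFab const& src)
    {
        if (dst.singleChunkPtr() && src.singleChunkPtr() &&
            dst.singleChunkSize() == src.singleChunkSize())
        {
            // One contiguous copy instead of a Fab-by-Fab loop.
            std::memcpy(dst.singleChunkPtr(), src.singleChunkPtr(),
                        src.singleChunkSize());
        } else {
            amrex::MultiFab::Copy(dst, src, 0, 0, dst.nComp(), dst.nGrowVect());
        }
    }

On GPU builds the chunk may live in device memory, in which case a
device-aware copy would be needed instead of std::memcpy.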
@@ -1233,6 +1252,8 @@ protected:
     std::unique_ptr<FabFactory<FAB> > m_factory;
     DataAllocator m_dallocator;
+    std::unique_ptr<detail::SingleChunkArena> m_single_chunk_arena;
+    Long m_single_chunk_size = 0;
 
     //! has define() been called?
     bool define_function_called = false;
 
@@ -1306,7 +1327,8 @@ private:
     using Iterator = typename std::vector<FAB*>::iterator;
 
     void AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
-                    const Vector<std::string>& tags);
+                    const Vector<std::string>& tags,
+                    bool alloc_single_chunk);
 
     void setFab_assert (int K, FAB const& fab) const;
 
@@ -1696,6 +1718,7 @@ FabArray<FAB>::release (int K)
 {
     const int li = localindex(K);
     if (li >= 0 && li < static_cast<int>(m_fabs_v.size()) && m_fabs_v[li] != nullptr) {
+        AMREX_ASSERT(m_single_chunk_arena == nullptr);
         Long nbytes = amrex::nBytesOwned(*m_fabs_v[li]);
         if (nbytes > 0) {
             for (auto const& t : m_tags) {
@@ -1715,6 +1738,7 @@ FabArray<FAB>::release (const MFIter& mfi)
 {
     const int li = mfi.LocalIndex();
     if (li >= 0 && li < static_cast<int>(m_fabs_v.size()) && m_fabs_v[li] != nullptr) {
+        AMREX_ASSERT(m_single_chunk_arena == nullptr);
         Long nbytes = amrex::nBytesOwned(*m_fabs_v[li]);
         if (nbytes > 0) {
             for (auto const& t : m_tags) {
@@ -1755,6 +1779,12 @@ FabArray<FAB>::clear ()
             updateMemUsage(t, -nbytes, nullptr);
         }
     }
+
+    if (m_single_chunk_arena) {
+        m_single_chunk_arena.reset();
+    }
+    m_single_chunk_size = 0;
+
     m_tags.clear();
 
     FabArrayBase::clear();
@@ -1880,6 +1910,8 @@ FabArray<FAB>::FabArray (FabArray<FAB>&& rhs) noexcept
     : FabArrayBase (static_cast<FabArrayBase&&>(rhs))
     , m_factory (std::move(rhs.m_factory))
     , m_dallocator (std::move(rhs.m_dallocator))
+    , m_single_chunk_arena(std::move(rhs.m_single_chunk_arena))
+    , m_single_chunk_size(std::exchange(rhs.m_single_chunk_size,0))
     , define_function_called(rhs.define_function_called)
     , m_fabs_v (std::move(rhs.m_fabs_v))
 #ifdef AMREX_USE_GPU
@@ -1909,6 +1941,8 @@ FabArray<FAB>::operator= (FabArray<FAB>&& rhs) noexcept
         FabArrayBase::operator=(static_cast<FabArrayBase&&>(rhs));
         m_factory = std::move(rhs.m_factory);
         m_dallocator = std::move(rhs.m_dallocator);
+        m_single_chunk_arena = std::move(rhs.m_single_chunk_arena);
+        std::swap(m_single_chunk_size, rhs.m_single_chunk_size);
         define_function_called = rhs.define_function_called;
         std::swap(m_fabs_v, rhs.m_fabs_v);
 #ifdef AMREX_USE_GPU
@@ -2008,7 +2042,7 @@ FabArray<FAB>::define (const BoxArray& bxs,
     addThisBD();
 
     if(info.alloc) {
-        AllocFabs(*m_factory, m_dallocator.m_arena, info.tags);
+        AllocFabs(*m_factory, m_dallocator.m_arena, info.tags, info.alloc_single_chunk);
 #ifdef BL_USE_TEAM
         ParallelDescriptor::MyTeam().MemoryBarrier();
 #endif
@@ -2018,8 +2052,11 @@
 template <class FAB>
 void
 FabArray<FAB>::AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
-                          const Vector<std::string>& tags)
+                          const Vector<std::string>& tags, bool alloc_single_chunk)
 {
+    if (shmem.alloc) { alloc_single_chunk = false; }
+    if constexpr (!IsBaseFab_v<FAB>) { alloc_single_chunk = false; }
+
     const int n = indexArray.size();
     const int nworkers = ParallelDescriptor::TeamSize();
     shmem.alloc = (nworkers > 1);
@@ -2029,6 +2066,18 @@ FabArray<FAB>::AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
 
     FabInfo fab_info;
     fab_info.SetAlloc(alloc).SetShared(shmem.alloc).SetArena(ar);
+    if (alloc_single_chunk) {
+        m_single_chunk_size = 0L;
+        for (int i = 0; i < n; ++i) {
+            int K = indexArray[i];
+            const Box& tmpbox = fabbox(K);
+            m_single_chunk_size += factory.nBytes(tmpbox, n_comp, K);
+        }
+        AMREX_ASSERT(m_single_chunk_size >= 0); // 0 is okay.
+        m_single_chunk_arena = std::make_unique<detail::SingleChunkArena>(ar, m_single_chunk_size);
+        fab_info.SetArena(m_single_chunk_arena.get());
+    }
+
     m_fabs_v.reserve(n);
 
     Long nbytes = 0L;
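For a MultiFab built with the default factory, the chunk size accumulated
above is simply the sum of the sizes of all locally owned Fabs, including
ghost cells. That can be cross-checked from user code with a sketch like the
following (the function name check_chunk_size is illustrative, not part of
the patch):

    #include <AMReX_MultiFab.H>
    #include <cstddef>

    void check_chunk_size (amrex::MultiFab const& mf)
    {
        if (mf.singleChunkPtr() == nullptr) { return; } // not single-chunk
        std::size_t expected = 0;
        for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
            // fabbox() is the valid box grown by the number of ghost cells.
            expected += mf.fabbox(mfi.index()).numPts()
                        * mf.nComp() * sizeof(amrex::Real);
        }
        AMREX_ALWAYS_ASSERT(mf.singleChunkSize() == expected);
    }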
@@ -2136,6 +2185,7 @@ FabArray<FAB>::setFab_assert (int K, FAB const& fab) const
     AMREX_ASSERT(!boxarray.empty());
     AMREX_ASSERT(fab.box() == fabbox(K));
     AMREX_ASSERT(distributionMap[K] == ParallelDescriptor::MyProc());
+    AMREX_ASSERT(m_single_chunk_arena == nullptr);
 }
 
 template <class FAB>
diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H
index 09dfd5e22db..44d0d9c269c 100644
--- a/Src/Base/AMReX_FabArrayBase.H
+++ b/Src/Base/AMReX_FabArrayBase.H
@@ -3,6 +3,7 @@
 #include
 
 #include
+#include
 #include
 #include
 #include
@@ -721,8 +722,46 @@ public:
     };
 
     static AMREX_EXPORT FabArrayStats m_FA_stats;
+    static AMREX_EXPORT bool m_alloc_single_chunk;
+
+    [[nodiscard]] static bool getAllocSingleChunk () { return m_alloc_single_chunk; }
 };
 
+namespace detail {
+    class SingleChunkArena final
+        : public Arena
+    {
+    public:
+        SingleChunkArena (Arena* a_arena, std::size_t a_size);
+        ~SingleChunkArena () override;
+
+        SingleChunkArena () = delete;
+        SingleChunkArena (const SingleChunkArena& rhs) = delete;
+        SingleChunkArena (SingleChunkArena&& rhs) = delete;
+        SingleChunkArena& operator= (const SingleChunkArena& rhs) = delete;
+        SingleChunkArena& operator= (SingleChunkArena&& rhs) = delete;
+
+        [[nodiscard]] void* alloc (std::size_t sz) override;
+        void free (void* pt) override;
+
+        // isDeviceAccessible and isHostAccessible can both be true.
+        [[nodiscard]] bool isDeviceAccessible () const override;
+        [[nodiscard]] bool isHostAccessible () const override;
+
+        [[nodiscard]] bool isManaged () const override;
+        [[nodiscard]] bool isDevice () const override;
+        [[nodiscard]] bool isPinned () const override;
+
+        [[nodiscard]] void* data () const noexcept { return (void*) m_root; }
+
+    private:
+        DataAllocator m_dallocator;
+        char* m_root = nullptr;
+        char* m_free = nullptr;
+        std::size_t m_size = 0;
+    };
+}
+
 [[nodiscard]] int nComp (FabArrayBase const& fa);
 [[nodiscard]] IntVect nGrowVect (FabArrayBase const& fa);
 [[nodiscard]] BoxArray const& boxArray (FabArrayBase const& fa);
diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp
index eb8fc99605b..c5168fb2ceb 100644
--- a/Src/Base/AMReX_FabArrayBase.cpp
+++ b/Src/Base/AMReX_FabArrayBase.cpp
@@ -86,6 +86,8 @@ FabArrayBase::FabArrayStats FabArrayBase::m_FA_stats;
 std::map FabArrayBase::m_mem_usage;
 std::vector FabArrayBase::m_region_tag;
 
+bool FabArrayBase::m_alloc_single_chunk = false;
+
 namespace {
     bool initialized = false;
@@ -122,6 +124,9 @@
         MaxComp = 1;
     }
 
+    ParmParse ppmf("amrex.mf");
+    ppmf.queryAdd("alloc_single_chunk", FabArrayBase::m_alloc_single_chunk);
+
     amrex::ExecOnFinalize(FabArrayBase::Finalize);
 
 #ifdef AMREX_MEM_PROFILING
@@ -2696,6 +2701,54 @@ FabArrayBase::flushParForCache ()
 
 #endif
 
+namespace detail {
+
+    SingleChunkArena::SingleChunkArena (Arena* a_arena, std::size_t a_size)
+        : m_dallocator(a_arena),
+          m_root((char*)m_dallocator.alloc(a_size)),
+          m_free(m_root),
+          m_size(a_size)
+    {}
+
+    SingleChunkArena::~SingleChunkArena ()
+    {
+        if (m_root) {
+            m_dallocator.free(m_root);
+        }
+    }
+
+    void* SingleChunkArena::alloc (std::size_t sz)
+    {
+        amrex::ignore_unused(m_size);
+        auto* p = (void*)m_free;
+        AMREX_ASSERT(sz <= m_size && ((m_free-m_root)+sz <= m_size));
+        m_free += sz;
+        return p;
+    }
+
+    void SingleChunkArena::free (void* /*pt*/) {}
+
+    bool SingleChunkArena::isDeviceAccessible () const {
+        return m_dallocator.arena()->isDeviceAccessible();
+    }
+
+    bool SingleChunkArena::isHostAccessible () const {
+        return m_dallocator.arena()->isHostAccessible();
+    }
+
+    bool SingleChunkArena::isManaged () const {
+        return m_dallocator.arena()->isManaged();
+    }
+
+    bool SingleChunkArena::isDevice () const {
+        return m_dallocator.arena()->isDevice();
+    }
+
+    bool SingleChunkArena::isPinned () const {
+        return m_dallocator.arena()->isPinned();
+    }
+}
+
 int nComp (FabArrayBase const& fa)
 {
     return fa.nComp();
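Since SingleChunkArena hands out consecutive sub-ranges of one allocation,
every local Fab's data ends up inside the block returned by
singleChunkPtr(). A small sanity check along these lines (the function name
check_layout is illustrative, not part of the patch):

    #include <AMReX_MultiFab.H>

    void check_layout (amrex::MultiFab const& mf)
    {
        auto const* base = reinterpret_cast<char const*>(mf.singleChunkPtr());
        if (base == nullptr) { return; } // not allocated as a single chunk
        for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
            auto const* p = reinterpret_cast<char const*>(mf[mfi].dataPtr());
            AMREX_ALWAYS_ASSERT(p >= base && p < base + mf.singleChunkSize());
        }
    }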
diff --git a/Src/Base/AMReX_FabFactory.H b/Src/Base/AMReX_FabFactory.H
index 01ee085f32f..7f65c21ce2b 100644
--- a/Src/Base/AMReX_FabFactory.H
+++ b/Src/Base/AMReX_FabFactory.H
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 
 namespace amrex {
 
@@ -59,8 +60,14 @@ public:
     AMREX_NODISCARD
     virtual FAB* create_alias (FAB const& /*rhs*/, int /*scomp*/, int /*ncomp*/) const { return nullptr; }
     virtual void destroy (FAB* fab) const = 0;
-    AMREX_NODISCARD
-    virtual FabFactory<FAB>* clone () const = 0;
+    AMREX_NODISCARD virtual FabFactory<FAB>* clone () const = 0;
+    AMREX_NODISCARD virtual Long nBytes (const Box& box, int ncomps, int /*box_index*/) const {
+        if constexpr (IsBaseFab_v<FAB>) {
+            return box.numPts() * ncomps * Long(sizeof(typename FAB::value_type));
+        } else {
+            return -1;
+        }
+    }
 };
 
 template <class FAB>
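A custom factory can opt its Fab type into single-chunk allocation by
overriding the new nBytes() hook so that it reports exactly what create()
will allocate from info.arena. A hypothetical sketch (the class name
ExampleFabFactory is an assumption; FArrayBox is used only to keep the
snippet self-contained, and for BaseFab types the default nBytes() above
already covers it):

    #include <AMReX_FabFactory.H>
    #include <AMReX_FArrayBox.H>

    class ExampleFabFactory final : public amrex::FabFactory<amrex::FArrayBox>
    {
    public:
        AMREX_NODISCARD
        amrex::FArrayBox* create (const amrex::Box& box, int ncomps,
                                  const amrex::FabInfo& info, int /*box_index*/) const override
        {
            // Honor info.arena so the data can come from the single-chunk arena.
            return new amrex::FArrayBox(box, ncomps, info.alloc, info.shared, info.arena);
        }
        void destroy (amrex::FArrayBox* fab) const override { delete fab; }
        AMREX_NODISCARD
        ExampleFabFactory* clone () const override { return new ExampleFabFactory(*this); }
        AMREX_NODISCARD
        amrex::Long nBytes (const amrex::Box& box, int ncomps, int /*box_index*/) const override
        {
            // Must match what create() allocates from info.arena.
            return box.numPts() * ncomps * amrex::Long(sizeof(amrex::Real));
        }
    };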
diff --git a/Src/EB/AMReX_MultiCutFab.H b/Src/EB/AMReX_MultiCutFab.H
index a7042b8e129..3548cd79fe4 100644
--- a/Src/EB/AMReX_MultiCutFab.H
+++ b/Src/EB/AMReX_MultiCutFab.H
@@ -144,8 +144,6 @@ private:
 
     FabArray<CutFab> m_data;
     const FabArray<EBCellFlagFab>* m_cellflags = nullptr;
-
-    void remove ();
 };
 
 }
diff --git a/Src/EB/AMReX_MultiCutFab.cpp b/Src/EB/AMReX_MultiCutFab.cpp
index 5bcbed1cfbe..b3987cca102 100644
--- a/Src/EB/AMReX_MultiCutFab.cpp
+++ b/Src/EB/AMReX_MultiCutFab.cpp
@@ -8,14 +8,51 @@
 namespace amrex {
 
+namespace detail {
+    class CutFabFactory final
+        : public DefaultFabFactory<CutFab>
+    {
+    public:
+        CutFabFactory (const FabArray<EBCellFlagFab>* a_cellflags)
+            : m_cellflags(a_cellflags)
+        {}
+
+        AMREX_NODISCARD
+        CutFab* create (const Box& box, int ncomps, const FabInfo& info, int box_index) const override
+        {
+            if ((*m_cellflags)[box_index].getType() == FabType::singlevalued) {
+                return new CutFab(box, ncomps, info.alloc, info.shared, info.arena);
+            } else {
+                return new CutFab();
+            }
+        }
+
+        AMREX_NODISCARD
+        CutFabFactory* clone () const override { return new CutFabFactory(m_cellflags); }
+
+        AMREX_NODISCARD
+        Long nBytes (const Box& box, int ncomps, int box_index) const override
+        {
+            if ((*m_cellflags)[box_index].getType() == FabType::singlevalued) {
+                return box.numPts() * ncomps * Long(sizeof(Real));
+            } else {
+                return Long(0);
+            }
+        }
+
+    private:
+        const FabArray<EBCellFlagFab>* m_cellflags = nullptr;
+    };
+}
+
 MultiCutFab::MultiCutFab () = default;
 
 MultiCutFab::MultiCutFab (const BoxArray& ba, const DistributionMapping& dm,
                           int ncomp, int ngrow, const FabArray<EBCellFlagFab>& cellflags)
-    : m_data(ba,dm,ncomp,ngrow,MFInfo(),DefaultFabFactory<CutFab>()),
+    : m_data(ba,dm,ncomp,ngrow,MFInfo(),detail::CutFabFactory(&cellflags)),
       m_cellflags(&cellflags)
 {
-    remove();
 }
 
 MultiCutFab::~MultiCutFab () = default;
@@ -24,21 +61,8 @@ void
 MultiCutFab::define (const BoxArray& ba, const DistributionMapping& dm,
                      int ncomp, int ngrow, const FabArray<EBCellFlagFab>& cellflags)
 {
-    m_data.define(ba,dm,ncomp,ngrow,MFInfo(),DefaultFabFactory<CutFab>()),
+    m_data.define(ba,dm,ncomp,ngrow,MFInfo(),detail::CutFabFactory(&cellflags)),
     m_cellflags = &cellflags;
-    remove();
-}
-
-void
-MultiCutFab::remove ()
-{
-    for (MFIter mfi(m_data); mfi.isValid(); ++mfi)
-    {
-        if (!ok(mfi))
-        {
-            delete m_data.release(mfi);
-        }
-    }
 }
 
 const CutFab&