Skip to content

Commit

Permalink
FabArray: Option to use a single contiguous chunk of memory (#3857)
Browse files Browse the repository at this point in the history
This adds an option to use a single contiguous chunk of memory for all
the data in Fabs of a FabArray/MultiFab/iMultiFab. One can change the
strategy for an individual MultiFab via
MFInfo::SetAllocSingleChunk(bool), and for all MultiFabs by default via
the ParmParse parameter amrex.mf.alloc_single_chunk=1.

This is considered an experimental feature. Please let us know if you
notice any issues.
  • Loading branch information
WeiqunZhang authored Mar 29, 2024
1 parent 67523bb commit 1be7257
Show file tree
Hide file tree
Showing 8 changed files with 215 additions and 28 deletions.
24 changes: 20 additions & 4 deletions Docs/sphinx_documentation/source/Basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2735,10 +2735,26 @@ covered by fine level grids.
Memory Allocation
=================

Some constructors of :cpp:`MultiFab`, :cpp:`FArrayBox`, etc. can take
an :cpp:`Arena` argument for memory allocation. This is usually not
important for CPU codes, but very important for GPU codes. We will
present more details in :ref:`sec:gpu:memory` in Chapter GPU.
Some constructors of :cpp:`MultiFab`, :cpp:`FArrayBox`, etc. can take an
:cpp:`Arena` argument for memory allocation. Some constructors of
:cpp:`MultiFab` can take an optional argument :cpp:`MFInfo`, which can be
used to set the arena. The choice of arena is usually not important for CPU
codes, but it is very important for GPU codes. We will present more details
about memory arenas in :ref:`sec:gpu:memory` in Chapter GPU.

Every :cpp:`FArrayBox` in a :cpp:`MultiFab` has a contiguous chunk of memory
for floating point data, whereas by default :cpp:`MultiFab` as a collection
of multiple :cpp:`FArrayBox`\ s does not store all floating point data in a
single contiguous chunk of memory. This behavior can be changed for all
:cpp:`MultiFab`\ s with the :cpp:`ParmParse` parameter,
``amrex.mf.alloc_single_chunk=1``, or for a specific :cpp:`MultiFab` by
passing a :cpp:`MFInfo` object (e.g.,
``MFInfo().SetAllocSingleChunk(true)``) to the constructor. One can call
:cpp:`MultiFab::singleChunkPtr()` to obtain a pointer to the single chunk
memory. Note that the function returns a null pointer if the :cpp:`MultiFab`
does not use a single contiguous chunk of memory. One can also call
:cpp:`MultiFab::singleChunkSize()` to obtain the size in bytes of the single
chunk memory.

AMReX has a Fortran module, :fortran:`amrex_mempool_module` that can be used to
allocate memory for Fortran pointers. The reason that such a module exists in
Expand Down
2 changes: 1 addition & 1 deletion Src/Base/AMReX_FArrayBox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,7 @@ FABio_8bit::write (std::ostream& os,
const Real mn = f.min<RunOn::Host>(k+comp);
const Real mx = f.max<RunOn::Host>(k+comp);
const Real* dat = f.dataPtr(k+comp);
Real rng = std::fabs(mx-mn);
Real rng = std::abs(mx-mn);
rng = (rng < eps) ? 0.0_rt : 255.0_rt/(mx-mn);
for(Long i(0); i < siz; ++i) {
Real v = rng*(dat[i]-mn);
Expand Down
56 changes: 53 additions & 3 deletions Src/Base/AMReX_FabArray.H
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,14 @@ Long nBytesOwned (BaseFab<T> const& fab) noexcept { return fab.nBytesOwned(); }
struct MFInfo {
// alloc: allocate memory or not
bool alloc = true;
bool alloc_single_chunk = FabArrayBase::getAllocSingleChunk();
Arena* arena = nullptr;
Vector<std::string> tags;

MFInfo& SetAlloc (bool a) noexcept { alloc = a; return *this; }

MFInfo& SetAllocSingleChunk (bool a) noexcept { alloc_single_chunk = a; return *this; }

MFInfo& SetArena (Arena* ar) noexcept { arena = ar; return *this; }

MFInfo& SetTag () noexcept { return *this; }
Expand Down Expand Up @@ -436,6 +439,22 @@ public:
#endif
}

//! Return a pointer to the start of the single chunk memory if this
//! object uses a single contiguous chunk of memory, nullptr otherwise.
[[nodiscard]] value_type* singleChunkPtr () noexcept {
    // data() yields an untyped pointer; convert it with a named cast.
    return m_single_chunk_arena
        ? static_cast<value_type*>(m_single_chunk_arena->data())
        : nullptr;
}

//! Return a read-only pointer to the start of the single chunk memory if
//! this object uses a single contiguous chunk of memory, nullptr otherwise.
[[nodiscard]] value_type const* singleChunkPtr () const noexcept {
    // data() yields an untyped pointer; convert it with a named cast.
    return m_single_chunk_arena
        ? static_cast<value_type const*>(m_single_chunk_arena->data())
        : nullptr;
}

//! Return the size in bytes of the single chunk memory if this object
//! uses a single contiguous chunk of memory, 0 otherwise.
[[nodiscard]] std::size_t singleChunkSize () const noexcept {
    // The size is stored as a signed Long; make the conversion to the
    // unsigned return type explicit.
    return static_cast<std::size_t>(m_single_chunk_size);
}

bool isAllRegular () const noexcept {
#ifdef AMREX_USE_EB
const auto *const f = dynamic_cast<EBFArrayBoxFactory const*>(m_factory.get());
Expand Down Expand Up @@ -1233,6 +1252,8 @@ protected:

std::unique_ptr<FabFactory<FAB> > m_factory;
DataAllocator m_dallocator;
std::unique_ptr<detail::SingleChunkArena> m_single_chunk_arena;
Long m_single_chunk_size = 0;

//! has define() been called?
bool define_function_called = false;
Expand Down Expand Up @@ -1306,7 +1327,8 @@ private:
using Iterator = typename std::vector<FAB*>::iterator;

void AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
const Vector<std::string>& tags);
const Vector<std::string>& tags,
bool alloc_single_chunk);

void setFab_assert (int K, FAB const& fab) const;

Expand Down Expand Up @@ -1696,6 +1718,7 @@ FabArray<FAB>::release (int K)
{
const int li = localindex(K);
if (li >= 0 && li < static_cast<int>(m_fabs_v.size()) && m_fabs_v[li] != nullptr) {
AMREX_ASSERT(m_single_chunk_arena == nullptr);
Long nbytes = amrex::nBytesOwned(*m_fabs_v[li]);
if (nbytes > 0) {
for (auto const& t : m_tags) {
Expand All @@ -1715,6 +1738,7 @@ FabArray<FAB>::release (const MFIter& mfi)
{
const int li = mfi.LocalIndex();
if (li >= 0 && li < static_cast<int>(m_fabs_v.size()) && m_fabs_v[li] != nullptr) {
AMREX_ASSERT(m_single_chunk_arena == nullptr);
Long nbytes = amrex::nBytesOwned(*m_fabs_v[li]);
if (nbytes > 0) {
for (auto const& t : m_tags) {
Expand Down Expand Up @@ -1755,6 +1779,12 @@ FabArray<FAB>::clear ()
updateMemUsage(t, -nbytes, nullptr);
}
}

if (m_single_chunk_arena) {
m_single_chunk_arena.reset();
}
m_single_chunk_size = 0;

m_tags.clear();

FabArrayBase::clear();
Expand Down Expand Up @@ -1880,6 +1910,8 @@ FabArray<FAB>::FabArray (FabArray<FAB>&& rhs) noexcept
: FabArrayBase (static_cast<FabArrayBase&&>(rhs))
, m_factory (std::move(rhs.m_factory))
, m_dallocator (std::move(rhs.m_dallocator))
, m_single_chunk_arena(std::move(rhs.m_single_chunk_arena))
, m_single_chunk_size(std::exchange(rhs.m_single_chunk_size,0))
, define_function_called(rhs.define_function_called)
, m_fabs_v (std::move(rhs.m_fabs_v))
#ifdef AMREX_USE_GPU
Expand Down Expand Up @@ -1909,6 +1941,8 @@ FabArray<FAB>::operator= (FabArray<FAB>&& rhs) noexcept
FabArrayBase::operator=(static_cast<FabArrayBase&&>(rhs));
m_factory = std::move(rhs.m_factory);
m_dallocator = std::move(rhs.m_dallocator);
m_single_chunk_arena = std::move(rhs.m_single_chunk_arena);
std::swap(m_single_chunk_size, rhs.m_single_chunk_size);
define_function_called = rhs.define_function_called;
std::swap(m_fabs_v, rhs.m_fabs_v);
#ifdef AMREX_USE_GPU
Expand Down Expand Up @@ -2008,7 +2042,7 @@ FabArray<FAB>::define (const BoxArray& bxs,
addThisBD();

if(info.alloc) {
AllocFabs(*m_factory, m_dallocator.m_arena, info.tags);
AllocFabs(*m_factory, m_dallocator.m_arena, info.tags, info.alloc_single_chunk);
#ifdef BL_USE_TEAM
ParallelDescriptor::MyTeam().MemoryBarrier();
#endif
Expand All @@ -2018,8 +2052,11 @@ FabArray<FAB>::define (const BoxArray& bxs,
template <class FAB>
void
FabArray<FAB>::AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
const Vector<std::string>& tags)
const Vector<std::string>& tags, bool alloc_single_chunk)
{
if (shmem.alloc) { alloc_single_chunk = false; }
if constexpr (!IsBaseFab_v<FAB>) { alloc_single_chunk = false; }

const int n = indexArray.size();
const int nworkers = ParallelDescriptor::TeamSize();
shmem.alloc = (nworkers > 1);
Expand All @@ -2029,6 +2066,18 @@ FabArray<FAB>::AllocFabs (const FabFactory<FAB>& factory, Arena* ar,
FabInfo fab_info;
fab_info.SetAlloc(alloc).SetShared(shmem.alloc).SetArena(ar);

if (alloc_single_chunk) {
m_single_chunk_size = 0L;
for (int i = 0; i < n; ++i) {
int K = indexArray[i];
const Box& tmpbox = fabbox(K);
m_single_chunk_size += factory.nBytes(tmpbox, n_comp, K);
}
AMREX_ASSERT(m_single_chunk_size >= 0); // 0 is okay.
m_single_chunk_arena = std::make_unique<detail::SingleChunkArena>(ar, m_single_chunk_size);
fab_info.SetArena(m_single_chunk_arena.get());
}

m_fabs_v.reserve(n);

Long nbytes = 0L;
Expand Down Expand Up @@ -2136,6 +2185,7 @@ FabArray<FAB>::setFab_assert (int K, FAB const& fab) const
AMREX_ASSERT(!boxarray.empty());
AMREX_ASSERT(fab.box() == fabbox(K));
AMREX_ASSERT(distributionMap[K] == ParallelDescriptor::MyProc());
AMREX_ASSERT(m_single_chunk_arena == nullptr);
}

template <class FAB>
Expand Down
39 changes: 39 additions & 0 deletions Src/Base/AMReX_FabArrayBase.H
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <AMReX_Config.H>

#include <AMReX_BoxArray.H>
#include <AMReX_DataAllocator.H>
#include <AMReX_DistributionMapping.H>
#include <AMReX_ParallelDescriptor.H>
#include <AMReX_ParallelReduce.H>
Expand Down Expand Up @@ -721,8 +722,46 @@ public:
};
static AMREX_EXPORT FabArrayStats m_FA_stats;

static AMREX_EXPORT bool m_alloc_single_chunk;

//! Return the current value of the static flag m_alloc_single_chunk.
[[nodiscard]] static bool getAllocSingleChunk () { return m_alloc_single_chunk; }
};

namespace detail {
    //! An Arena that serves allocations out of one buffer obtained up
    //! front from another Arena. The buffer is acquired in the
    //! constructor and handed back in the destructor; alloc() returns
    //! consecutive pieces of it and free() does nothing.
    class SingleChunkArena final
        : public Arena
    {
    public:
        //! Obtain a buffer of a_size bytes through a_arena.
        SingleChunkArena (Arena* a_arena, std::size_t a_size);
        //! Hand the buffer back to the allocator it came from.
        ~SingleChunkArena () override;

        // The object owns its buffer through raw pointers, so it is
        // neither default-constructible, copyable nor movable.
        SingleChunkArena () = delete;
        SingleChunkArena (const SingleChunkArena& rhs) = delete;
        SingleChunkArena (SingleChunkArena&& rhs) = delete;
        SingleChunkArena& operator= (const SingleChunkArena& rhs) = delete;
        SingleChunkArena& operator= (SingleChunkArena&& rhs) = delete;

        //! Return the next sz bytes of the buffer.
        [[nodiscard]] void* alloc (std::size_t sz) override;
        //! No-op; individual pieces are never returned.
        void free (void* pt) override;

        // isDeviceAccessible and isHostAccessible can both be true.
        [[nodiscard]] bool isDeviceAccessible () const override;
        [[nodiscard]] bool isHostAccessible () const override;

        [[nodiscard]] bool isManaged () const override;
        [[nodiscard]] bool isDevice () const override;
        [[nodiscard]] bool isPinned () const override;

        //! Return the start of the buffer.
        [[nodiscard]] void* data () const noexcept { return m_root; }

    private:
        DataAllocator m_dallocator;   //!< allocator the buffer is taken from
        char* m_root = nullptr;       //!< start of the buffer
        char* m_free = nullptr;       //!< start of the part not yet handed out
        std::size_t m_size = 0;       //!< total size of the buffer in bytes
    };
}

[[nodiscard]] int nComp (FabArrayBase const& fa);
[[nodiscard]] IntVect nGrowVect (FabArrayBase const& fa);
[[nodiscard]] BoxArray const& boxArray (FabArrayBase const& fa);
Expand Down
53 changes: 53 additions & 0 deletions Src/Base/AMReX_FabArrayBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ FabArrayBase::FabArrayStats FabArrayBase::m_FA_stats;
std::map<std::string,FabArrayBase::meminfo> FabArrayBase::m_mem_usage;
std::vector<std::string> FabArrayBase::m_region_tag;

bool FabArrayBase::m_alloc_single_chunk = false;

namespace
{
bool initialized = false;
Expand Down Expand Up @@ -122,6 +124,9 @@ FabArrayBase::Initialize ()
MaxComp = 1;
}

ParmParse ppmf("amrex.mf");
ppmf.queryAdd("alloc_single_chunk", FabArrayBase::m_alloc_single_chunk);

amrex::ExecOnFinalize(FabArrayBase::Finalize);

#ifdef AMREX_MEM_PROFILING
Expand Down Expand Up @@ -2696,6 +2701,54 @@ FabArrayBase::flushParForCache ()

#endif

namespace detail {

//! Allocate a buffer of a_size bytes through a_arena and mark all of it
//! as available.
SingleChunkArena::SingleChunkArena (Arena* a_arena, std::size_t a_size)
    : m_dallocator(a_arena),
      m_root(static_cast<char*>(m_dallocator.alloc(a_size))),
      m_free(m_root),
      m_size(a_size)
{}

//! Release the buffer, if one was obtained.
SingleChunkArena::~SingleChunkArena ()
{
    if (m_root != nullptr) {
        m_dallocator.free(m_root);
    }
}

//! Return the current free position in the buffer and advance it by sz
//! bytes. The assertion checks that the request fits in what is left.
void* SingleChunkArena::alloc (std::size_t sz)
{
    // m_size is only read inside the assertion below; presumably this
    // call silences an unused-member warning when the assertion is
    // compiled out.
    amrex::ignore_unused(m_size);
    AMREX_ASSERT(sz <= m_size &&
                 (static_cast<std::size_t>(m_free - m_root) + sz <= m_size));
    void* p = m_free;
    m_free += sz;
    return p;
}

//! Intentionally empty: the buffer is released as a whole by the destructor.
void SingleChunkArena::free (void* /*pt*/) {}

// The queries below forward to the arena held by m_dallocator.

bool SingleChunkArena::isDeviceAccessible () const {
    return m_dallocator.arena()->isDeviceAccessible();
}

bool SingleChunkArena::isHostAccessible () const {
    return m_dallocator.arena()->isHostAccessible();
}

bool SingleChunkArena::isManaged () const {
    return m_dallocator.arena()->isManaged();
}

bool SingleChunkArena::isDevice () const {
    return m_dallocator.arena()->isDevice();
}

bool SingleChunkArena::isPinned () const {
    return m_dallocator.arena()->isPinned();
}
}

int nComp (FabArrayBase const& fa)
{
return fa.nComp();
Expand Down
11 changes: 9 additions & 2 deletions Src/Base/AMReX_FabFactory.H
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <AMReX_MakeType.H>
#include <AMReX_Vector.H>
#include <AMReX_Arena.H>
#include <AMReX_TypeTraits.H>

namespace amrex
{
Expand Down Expand Up @@ -59,8 +60,14 @@ public:
AMREX_NODISCARD
virtual FAB* create_alias (FAB const& /*rhs*/, int /*scomp*/, int /*ncomp*/) const { return nullptr; }
virtual void destroy (FAB* fab) const = 0;
AMREX_NODISCARD
virtual FabFactory<FAB>* clone () const = 0;
AMREX_NODISCARD virtual FabFactory<FAB>* clone () const = 0;
//! Number of bytes needed for the data of a FAB covering the given box
//! with ncomps components. Returns -1 when FAB is not a BaseFab type.
AMREX_NODISCARD virtual Long nBytes (const Box& box, int ncomps, int /*box_index*/) const {
    if constexpr (!IsBaseFab_v<FAB>) {
        return -1;
    } else {
        const Long bytes_per_value = static_cast<Long>(sizeof(typename FAB::value_type));
        return box.numPts() * ncomps * bytes_per_value;
    }
}
};

template <class FAB>
Expand Down
2 changes: 0 additions & 2 deletions Src/EB/AMReX_MultiCutFab.H
Original file line number Diff line number Diff line change
Expand Up @@ -144,8 +144,6 @@ private:

FabArray<CutFab> m_data;
const FabArray<EBCellFlagFab>* m_cellflags = nullptr;

void remove ();
};

}
Expand Down
Loading

0 comments on commit 1be7257

Please sign in to comment.