From 9348cac6b0705152caf0aa74498d9a3a7a277e11 Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Tue, 12 Dec 2023 15:48:52 -0800 Subject: [PATCH] Fix Resize Issue of Fab with the Async Arena (#3663) ## Summary Previously there was an issue with resizing Fabs using The_Async_Arena. The issue was the previous allocation during resize might be done on a different stream. This commit fixes the issue and makes the following patterns work. FArrayBox tmp0(The_Async_Arena()); FArrayBox tmp1(The_Async_Arena()); FArrayBox tmp2; for (MFIter ...) { tmp0.resize(box,ncomp,The_Async_Arena()); tmp1.resize(box,ncomp); tmp2.resize(box,ncomp,The_Async_Arena()); } ## Additional background https://github.com/AMReX-Astro/Castro/pull/2677 ## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Src/Base/AMReX_Arena.H | 5 +++++ Src/Base/AMReX_BaseFab.H | 25 ++++++++++++++++++++++++- Src/Base/AMReX_GpuTypes.H | 1 + Src/Base/AMReX_PArena.H | 5 +++++ 4 files changed, 35 insertions(+), 1 deletion(-) diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index e42ebdc9cd2..d328f693a96 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -157,6 +157,11 @@ public: */ virtual void registerForProfiling (const std::string& memory_name); +#ifdef AMREX_USE_GPU + //! Is this GPU stream ordered memory allocator? + [[nodiscard]] virtual bool isStreamOrderedArena () const { return false; } +#endif + /** * \brief Given a minimum required arena size of sz bytes, this returns * the next largest arena size that will align to align_size bytes diff --git a/Src/Base/AMReX_BaseFab.H b/Src/Base/AMReX_BaseFab.H index 006d7639ad6..e0331e7a0ae 100644 --- a/Src/Base/AMReX_BaseFab.H +++ b/Src/Base/AMReX_BaseFab.H @@ -1631,6 +1631,9 @@ protected: Long truesize = 0L; //!< nvar*numpts that was allocated on heap. bool ptr_owner = false; //!< Owner of T*? bool shared_memory = false; //!< Is the memory allocated in shared memory? +#ifdef AMREX_USE_GPU + gpuStream_t alloc_stream{}; +#endif }; template @@ -1902,6 +1905,9 @@ BaseFab::define () this->truesize = this->nvar*this->domain.numPts(); this->ptr_owner = true; this->dptr = static_cast(this->alloc(this->truesize*sizeof(T))); +#ifdef AMREX_USE_GPU + this->alloc_stream = Gpu::gpuStream(); +#endif placementNew(this->dptr, this->truesize); @@ -2003,6 +2009,9 @@ BaseFab::BaseFab (BaseFab&& rhs) noexcept dptr(rhs.dptr), domain(rhs.domain), nvar(rhs.nvar), truesize(rhs.truesize), ptr_owner(rhs.ptr_owner), shared_memory(rhs.shared_memory) +#ifdef AMREX_USE_GPU + , alloc_stream(rhs.alloc_stream) +#endif { rhs.dptr = nullptr; rhs.ptr_owner = false; @@ -2021,6 +2030,9 @@ BaseFab::operator= (BaseFab&& rhs) noexcept truesize = rhs.truesize; ptr_owner = rhs.ptr_owner; shared_memory = rhs.shared_memory; +#ifdef AMREX_USE_GPU + alloc_stream = rhs.alloc_stream; +#endif rhs.dptr = nullptr; rhs.ptr_owner = false; @@ -2062,7 +2074,11 @@ BaseFab::resize (const Box& b, int n, Arena* ar) this->dptr = nullptr; define(); } - else if (this->nvar*this->domain.numPts() > this->truesize) + else if (this->nvar*this->domain.numPts() > this->truesize +#ifdef AMREX_USE_GPU + || (arena()->isStreamOrderedArena() && alloc_stream != Gpu::gpuStream()) +#endif + ) { if (this->shared_memory) { amrex::Abort("BaseFab::resize: BaseFab in shared memory cannot increase size"); @@ -2114,7 +2130,14 @@ BaseFab::clear () noexcept placementDelete(this->dptr, this->truesize); +#ifdef AMREX_USE_GPU + auto current_stream = Gpu::Device::gpuStream(); + Gpu::Device::setStream(alloc_stream); +#endif this->free(this->dptr); +#ifdef AMREX_USE_GPU + Gpu::Device::setStream(current_stream); +#endif if (this->nvar > 1) { amrex::update_fab_stats(-this->truesize/this->nvar, -this->truesize, sizeof(T)); diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 8b5680b41b8..ecb992983ba 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -29,6 +29,7 @@ struct Dim1 { struct gpuStream_t { sycl::queue* queue = nullptr; bool operator== (gpuStream_t const& rhs) noexcept { return queue == rhs.queue; } + bool operator!= (gpuStream_t const& rhs) noexcept { return queue != rhs.queue; } }; #endif diff --git a/Src/Base/AMReX_PArena.H b/Src/Base/AMReX_PArena.H index cc221ba7bae..d2e8c8ebec3 100644 --- a/Src/Base/AMReX_PArena.H +++ b/Src/Base/AMReX_PArena.H @@ -38,6 +38,11 @@ public: [[nodiscard]] bool isDevice () const final; [[nodiscard]] bool isPinned () const final; +#ifdef AMREX_USE_GPU + //! Is this CUDA stream ordered memory allocator? + [[nodiscard]] bool isStreamOrderedArena () const final { return true; } +#endif + #ifdef AMREX_CUDA_GE_11_2 private: cudaMemPool_t m_pool;