From 9e354e2af4412c290cd4bfba4913098c04381175 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Wed, 30 Sep 2020 16:34:29 -0400 Subject: [PATCH] Reduce the number of allocated signals Enable this optimization when the barrier is disabled, since reuse requires a signal wait. Use the size of pending AQL signals as the size of signal pool. Change-Id: I2754a0f8b67e19d2601c58945e10fdf0e8be1624 (cherry picked from commit 86fa3720878bbebcdd09b773d9c1c1f5e5c8c652) --- device/rocm/rocvirtual.cpp | 43 +++++++++++++++++++++++++++++++++----- device/rocm/rocvirtual.hpp | 1 + 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/device/rocm/rocvirtual.cpp b/device/rocm/rocvirtual.cpp index 98a74fbac..b8885e1dc 100644 --- a/device/rocm/rocvirtual.cpp +++ b/device/rocm/rocvirtual.cpp @@ -455,8 +455,21 @@ bool VirtualGPU::dispatchGenericAqlPacket( // TODO: placeholder to setup the kernel to populate start and end timestamp. if (timestamp_ != nullptr) { + // Pool size must grow to the size of pending AQL packets + const uint32_t pool_size = index - read; + if (pool_size >= signal_pool_.size()) { + ProfilingSignal profiling_signal = {}; + if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profiling_signal.signal_)) { + LogPrintfError("Failed signal allocation id = %d", pool_size); + return false; + } + signal_pool_.push_back(profiling_signal); + assert(queueSize >= signal_pool_.size() && "Pool will be reallocated!"); + } + // Move index inside the valid pool + ++current_signal_ %= signal_pool_.size(); // Find signal slot - ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask]; + ProfilingSignal* profilingSignal = &signal_pool_[current_signal_]; // Make sure we save the old results in the TS structure if (profilingSignal->ts_ != nullptr) { profilingSignal->ts_->checkGpuTime(); @@ -536,6 +549,8 @@ bool VirtualGPU::dispatchGenericAqlPacket( LogPrintfError("Failed signal [0x%lx] wait", signal.handle); return false; } + // Reset the pool of signals + 
current_signal_ = 0; } return true; @@ -620,8 +635,14 @@ void VirtualGPU::ResetQueueStates() { // Release all memory dependencies memoryDependency().clear(); - // Release the pool, since runtime just completed a barrier - resetKernArgPool(); + if (dev().settings().barrier_sync_) { + // Release the pool, since runtime just completed a barrier + // @note: Runtime can reset kernel arg pool only if the barrier with L2 invalidation was issued + resetKernArgPool(); + } else { + // Reset the pool of signals + current_signal_ = 0; + } } // ================================================================================================ @@ -833,8 +854,17 @@ bool VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count) { } if (signal_pool_count != 0) { - signal_pool_.resize(signal_pool_count); - for (uint i = 0; i < signal_pool_count; ++i) { + // Reserve signal pool for all entries in the queue, since profiling logic will save the + // pointer in timestamp info for the future references + signal_pool_.reserve(signal_pool_count); + // If barrier is disabled, then allocate a small portion of all signals and grow the array later. + // @note: the optimization requires a wait for signal on reuse, which is only available when + // the barrier is disabled + constexpr uint32_t kDefaultSignalPoolSize = 32; + const uint32_t default_signal_pool_size = (dev().settings().barrier_sync_) ? 
+ signal_pool_count : kDefaultSignalPoolSize; + signal_pool_.resize(default_signal_pool_size); + for (uint i = 0; i < default_signal_pool_size; ++i) { ProfilingSignal profilingSignal; if (HSA_STATUS_SUCCESS != hsa_signal_create(0, 0, nullptr, &profilingSignal.signal_)) { return false; @@ -881,6 +911,9 @@ void* VirtualGPU::allocKernArg(size_t size, size_t alignment) { } resetKernArgPool(); + + // Reset the pool of signals + current_signal_ = 0; } } while (true); diff --git a/device/rocm/rocvirtual.hpp b/device/rocm/rocvirtual.hpp index b05a787e3..0941dd3db 100644 --- a/device/rocm/rocvirtual.hpp +++ b/device/rocm/rocvirtual.hpp @@ -388,6 +388,7 @@ class VirtualGPU : public device::VirtualDevice { uint kernarg_pool_cur_offset_; std::vector<ProfilingSignal> signal_pool_; //!< Pool of signals for profiling + uint32_t current_signal_ = 0; //!< Current available signal in the pool friend class Timestamp; // PM4 packet for gfx8 performance counter