From f06d662bc54179bcbe12629132f9df10af5dc8ce Mon Sep 17 00:00:00 2001 From: Simone Balducci Date: Tue, 19 Mar 2024 13:08:25 +0100 Subject: [PATCH 1/5] Switch to `fedora-latest` --- .github/workflows/clang_format.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index 496c6fef..fa28cfd5 100644 --- a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -9,7 +9,7 @@ on: jobs: formatting-check: name: Formatting Check - runs-on: ubuntu-latest + runs-on: fedora-latest strategy: matrix: path: From 2e7bfe3723c63ca2f89e2e2574f707282b0840b7 Mon Sep 17 00:00:00 2001 From: Simone Balducci Date: Tue, 19 Mar 2024 13:08:37 +0100 Subject: [PATCH 2/5] Update to `clang-format-17` --- .github/workflows/clang_format.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index fa28cfd5..6429bc8c 100644 --- a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -19,6 +19,6 @@ jobs: - name: Run clang-format style check uses: jidicula/clang-format-action@v4.11.0 with: - clang-format-version: '16' + clang-format-version: '17' check-path: ${{ matrix.path }} exclude-regex: 'CLUEstering/include/test/doctest.h' From ce08d26f574e8ac43ccbfc5336c5b3dab4c64b2c Mon Sep 17 00:00:00 2001 From: Simone Balducci Date: Tue, 19 Mar 2024 13:32:52 +0100 Subject: [PATCH 3/5] Go back to ubuntu --- .github/workflows/clang_format.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml index 6429bc8c..50627311 100644 --- a/.github/workflows/clang_format.yml +++ b/.github/workflows/clang_format.yml @@ -9,7 +9,7 @@ on: jobs: formatting-check: name: Formatting Check - runs-on: fedora-latest + runs-on: ubuntu-latest strategy: matrix: path: From 9e823d6c88f441fae97bb825c506b4b4f8ca1645 Mon Sep 17 00:00:00 2001 From: Simone Balducci Date: Tue, 19 Mar 2024 13:35:40 +0100 Subject: [PATCH 4/5] Fix `clang-format` file --- .clang-format | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.clang-format b/.clang-format index ff55ab27..6935cfbd 100644 --- a/.clang-format +++ b/.clang-format @@ -1,7 +1,7 @@ --- Language: Cpp BasedOnStyle: Google -ColumnLimit: 108 +ColumnLimit: 90 NamespaceIndentation: All SortIncludes: false IndentWidth: 2 @@ -11,8 +11,10 @@ PenaltyExcessCharacter: 100 AlignAfterOpenBracket: Align AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false -BinPackParameters: false AlwaysBreakTemplateDeclarations: Yes ReflowComments: false BinPackArguments: false BinPackParameters: false +DerivePointerAlignment: false +PointerAlignment: Left +ReferenceAlignment: Left From b68c7dd34bfed2e57758507410c7358a00daeddd Mon Sep 17 00:00:00 2001 From: Simone Balducci Date: Tue, 19 Mar 2024 13:38:48 +0100 Subject: [PATCH 5/5] Formatting --- .../alpaka/AlpakaCore/AllocatorPolicy.h | 3 +- .../alpaka/AlpakaCore/CachedBufAlloc.h | 22 ++- .../alpaka/AlpakaCore/CachingAllocator.h | 105 +++++----- CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h | 24 ++- CLUEstering/alpaka/AlpakaCore/alpakaConfig.h | 3 +- CLUEstering/alpaka/AlpakaCore/alpakaDevices.h | 3 +- CLUEstering/alpaka/AlpakaCore/alpakaFwd.h | 12 +- CLUEstering/alpaka/AlpakaCore/alpakaMemory.h | 72 ++++--- CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h | 81 +++++--- CLUEstering/alpaka/AlpakaCore/backend.h | 3 +- .../AlpakaCore/getDeviceCachingAllocator.h | 3 +- .../alpaka/AlpakaCore/getDeviceIndex.h | 8 +- .../AlpakaCore/getHostCachingAllocator.h | 17 +- .../alpaka/BindingModules/binding_cpu.cc | 154 +++++++++------ .../alpaka/BindingModules/binding_cpu_tbb.cc | 150 +++++++++------ .../alpaka/BindingModules/binding_gpu_cuda.cc | 150 +++++++++------ .../alpaka/BindingModules/binding_gpu_hip.cc | 154 +++++++++------ .../alpaka/BindingModules/binding_kernels.cc | 6 +- CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h | 99 ++++++---- CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h | 70 +++++-- CLUEstering/alpaka/CLUE/ConvolutionalKernel.h | 20 +- CLUEstering/alpaka/CLUE/Run.h | 180 +++++++++--------- CLUEstering/alpaka/DataFormats/Points.h | 12 +- .../DataFormats/alpaka/AlpakaVecArray.h | 24 +-- .../alpaka/DataFormats/alpaka/PointsAlpaka.h | 3 +- .../alpaka/DataFormats/alpaka/TilesAlpaka.h | 29 +-- 26 files changed, 851 insertions(+), 556 deletions(-) diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h index 4561d91f..d76b7d21 100644 --- a/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h +++ b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h @@ -14,7 +14,8 @@ namespace cms::alpakatools { template constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous; -#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED +#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || \ + defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED template <> constexpr inline AllocatorPolicy allocator_policy = #if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR diff --git a/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h index ca9347ee..53f9952d 100644 --- a/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h +++ b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h @@ -39,7 +39,12 @@ namespace cms::alpakatools { //! The caching memory allocator implementation for the pinned host memory template - struct CachedBufAlloc { + struct CachedBufAlloc { template ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, alpaka::QueueCudaRtNonBlocking queue, @@ -96,7 +101,12 @@ namespace cms::alpakatools { //! The caching memory allocator implementation for the pinned host memory template - struct CachedBufAlloc { + struct CachedBufAlloc { template ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, alpaka::QueueHipRtNonBlocking queue, @@ -152,9 +162,11 @@ namespace cms::alpakatools { } // namespace traits template - ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) { - return traits::CachedBufAlloc, TIdx, TDev, TQueue>::allocCachedBuf( - dev, queue, extent); + ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, + TQueue queue, + TExtent const& extent = TExtent()) { + return traits::CachedBufAlloc, TIdx, TDev, TQueue>:: + allocCachedBuf(dev, queue, extent); } } // namespace cms::alpakatools diff --git a/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h index 51023b85..4d3107ec 100644 --- a/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h +++ b/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h @@ -85,14 +85,16 @@ namespace cms::alpakatools { template class CachingAllocator { public: - using Device = TDevice; // the "memory device", where the memory will be allocated - using Queue = TQueue; // the queue used to submit the memory operations + using Device = TDevice; // the "memory device", where the memory will be allocated + using Queue = TQueue; // the queue used to submit the memory operations using Event = alpaka::Event; // the events used to synchronise the operations using Buffer = alpaka::Buf, size_t>; // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU. - static_assert(std::is_same_v> or std::is_same_v, - "The \"memory device\" type can either be the same as the \"synchronisation device\" " + static_assert(std::is_same_v> or + std::is_same_v, + "The \"memory device\" type can either be the same as the " + "\"synchronisation device\" " "type, or be the " "host CPU."); @@ -112,8 +114,8 @@ namespace cms::alpakatools { size_t maxCachedBytes, // total storage for the allocator (0 means no limit); double maxCachedFraction, // fraction of total device memory taken for the allocator (0 means no limit); - // if both maxCachedBytes and maxCachedFraction are non-zero, - // the smallest resulting value is used. + // if both maxCachedBytes and maxCachedFraction are non-zero, + // the smallest resulting value is used. bool reuseSameQueueAllocations, // reuse non-ready allocations if they are in the same queue as the new one; // this is safe only if all memory operations are scheduled in the same queue bool debug) @@ -135,7 +137,8 @@ namespace cms::alpakatools { << " resulting bins:\n"; for (auto bin = minBin_; bin <= maxBin_; ++bin) { auto binSize = detail::power(binGrowth, bin); - out << " " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n'; + out << " " << std::right << std::setw(12) << detail::as_bytes(binSize) + << '\n'; } out << " maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_); std::cout << out.str() << std::endl; @@ -201,22 +204,26 @@ namespace cms::alpakatools { if (debug_) { std::ostringstream out; - out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes - << " bytes at " << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() - << " , event " << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() - << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size() - << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " + << block.bytes << " bytes at " << ptr << " from associated queue " + << block.queue->m_spQueueImpl.get() << " , event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() + << " available blocks cached (" << cachedBytes_.free << " bytes), " + << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." << std::endl; std::cout << out.str() << std::endl; } } else { // if the buffer is not recached, it is automatically freed when block goes out of scope if (debug_) { std::ostringstream out; - out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes - << " bytes at " << ptr << " from associated queue " << block.queue->m_spQueueImpl.get() - << ", event " << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() - << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size() - << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " + << block.bytes << " bytes at " << ptr << " from associated queue " + << block.queue->m_spQueueImpl.get() << ", event " + << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size() + << " available blocks cached (" << cachedBytes_.free << " bytes), " + << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." << std::endl; std::cout << out.str() << std::endl; } } @@ -257,10 +264,11 @@ namespace cms::alpakatools { return std::make_tuple(minBin_, minBinBytes_); } if (bytes > maxBinBytes_) { - throw std::runtime_error("Requested allocation size " + std::to_string(bytes) + - " bytes is too large for the caching detail with maximum bin " + - std::to_string(maxBinBytes_) + - " bytes. You might want to increase the maximum bin size"); + throw std::runtime_error( + "Requested allocation size " + std::to_string(bytes) + + " bytes is too large for the caching detail with maximum bin " + + std::to_string(maxBinBytes_) + + " bytes. You might want to increase the maximum bin size"); } unsigned int bin = minBin_; size_t binBytes = minBinBytes_; @@ -301,11 +309,13 @@ namespace cms::alpakatools { if (debug_) { std::ostringstream out; - out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at " - << block.buffer->data() << " (" << block.bytes << " bytes) for queue " - << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() - << " (previously associated with stream " << iBlock->second.queue->m_spQueueImpl.get() - << " , event " << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) + << " reused cached block at " << block.buffer->data() << " (" + << block.bytes << " bytes) for queue " << block.queue->m_spQueueImpl.get() + << ", event " << block.event->m_spEventImpl.get() + << " (previously associated with stream " + << iBlock->second.queue->m_spQueueImpl.get() << " , event " + << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl; std::cout << out.str() << std::endl; } @@ -324,11 +334,14 @@ namespace cms::alpakatools { return alpaka::allocBuf(device_, bytes); } else if constexpr (std::is_same_v) { // allocate pinned host memory accessible by the queue's platform - return alpaka::allocMappedBuf>, std::byte, size_t>(device_, bytes); + return alpaka::allocMappedBuf>, std::byte, size_t>( + device_, bytes); } else { // unsupported combination - static_assert(std::is_same_v> or std::is_same_v, - "The \"memory device\" type can either be the same as the \"synchronisation device\" " + static_assert(std::is_same_v> or + std::is_same_v, + "The \"memory device\" type can either be the same as the " + "\"synchronisation device\" " "type, or be " "the host CPU."); } @@ -341,8 +354,9 @@ namespace cms::alpakatools { // the allocation attempt failed: free all cached blocks on the device and retry if (debug_) { std::ostringstream out; - out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate " - << block.bytes << " bytes for queue " << block.queue->m_spQueueImpl.get() + out << "\t" << deviceType_ << " " << alpaka::getName(device_) + << " failed to allocate " << block.bytes << " bytes for queue " + << block.queue->m_spQueueImpl.get() << ", retrying after freeing cached allocations" << std::endl; std::cout << out.str() << std::endl; } @@ -366,10 +380,10 @@ namespace cms::alpakatools { if (debug_) { std::ostringstream out; - out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at " - << block.buffer->data() << " (" << block.bytes << " bytes associated with queue " - << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "." - << std::endl; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) + << " allocated new block at " << block.buffer->data() << " (" << block.bytes + << " bytes associated with queue " << block.queue->m_spQueueImpl.get() + << ", event " << block.event->m_spEventImpl.get() << "." << std::endl; std::cout << out.str() << std::endl; } } @@ -383,10 +397,11 @@ namespace cms::alpakatools { if (debug_) { std::ostringstream out; - out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << iBlock->second.bytes - << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) << " available blocks cached (" - << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks (" - << cachedBytes_.live << " bytes) outstanding." << std::endl; + out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " + << iBlock->second.bytes << " bytes.\n\t\t " << (cachedBlocks_.size() - 1) + << " available blocks cached (" << cachedBytes_.free << " bytes), " + << liveBlocks_.size() << " live blocks (" << cachedBytes_.live + << " bytes) outstanding." << std::endl; std::cout << out.str() << std::endl; } @@ -395,18 +410,22 @@ namespace cms::alpakatools { } // TODO replace with a tbb::concurrent_multimap ? - using CachedBlocks = std::multimap; // ordered by the allocation bin + using CachedBlocks = + std::multimap; // ordered by the allocation bin // TODO replace with a tbb::concurrent_map ? - using BusyBlocks = std::map; // ordered by the address of the allocated memory + using BusyBlocks = + std::map; // ordered by the address of the allocated memory - inline static const std::string deviceType_ = boost::core::demangle(typeid(Device).name()); + inline static const std::string deviceType_ = + boost::core::demangle(typeid(Device).name()); mutable std::mutex mutex_; Device device_; // the device where the memory is allocated CachedBytes cachedBytes_; CachedBlocks cachedBlocks_; // Set of cached device allocations available for reuse - BusyBlocks liveBlocks_; // map of pointers to the live device allocations currently in use + BusyBlocks + liveBlocks_; // map of pointers to the live device allocations currently in use const unsigned int binGrowth_; // Geometric growth factor for bin-sizes const unsigned int minBin_; diff --git a/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h b/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h index a5a163d7..bcc37367 100644 --- a/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h +++ b/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h @@ -26,16 +26,22 @@ namespace alpaka { struct Enqueue { using TApi = ApiCudaRt; - static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) { + static void CUDART_CB callback(cudaStream_t /*queue*/, + cudaError_t /*status*/, + void* arg) { //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status); std::unique_ptr pTask(static_cast(arg)); (*pTask)(); } - ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void { + ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) + -> void { auto pTask = std::make_unique(std::move(task)); - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaStreamAddCallback( - alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + cudaStreamAddCallback(alpaka::getNativeHandle(queue), + callback, + static_cast(pTask.release()), + 0u)); } }; #endif // ALPAKA_ACC_GPU_CUDA_ENABLED @@ -52,10 +58,14 @@ namespace alpaka { (*pTask)(); } - ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void { + ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) + -> void { auto pTask = std::make_unique(std::move(task)); - ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipStreamAddCallback( - alpaka::getNativeHandle(queue), callback, static_cast(pTask.release()), 0u)); + ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK( + hipStreamAddCallback(alpaka::getNativeHandle(queue), + callback, + static_cast(pTask.release()), + 0u)); } }; #endif // ALPAKA_ACC_GPU_HIP_ENABLED diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h b/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h index 3f94e9cf..454cee23 100644 --- a/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h +++ b/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h @@ -36,7 +36,8 @@ namespace alpaka_common { // trick to force expanding ALPAKA_ACCELERATOR_NAMESPACE before stringification inside DEFINE_FWK_MODULE #define DEFINE_FWK_ALPAKA_MODULE2(name) DEFINE_FWK_MODULE(name) -#define DEFINE_FWK_ALPAKA_MODULE(name) DEFINE_FWK_ALPAKA_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) +#define DEFINE_FWK_ALPAKA_MODULE(name) \ + DEFINE_FWK_ALPAKA_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name) #define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(name) DEFINE_FWK_EVENTSETUP_MODULE(name) #define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \ diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h b/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h index d980bf38..ca0020ff 100644 --- a/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h +++ b/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h @@ -12,7 +12,8 @@ namespace cms::alpakatools { // alpaka host device - inline const alpaka_common::DevHost host = alpaka::getDevByIdx(0u); + inline const alpaka_common::DevHost host = + alpaka::getDevByIdx(0u); // alpaka accelerator devices template diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h b/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h index 2234e440..fbc5c353 100644 --- a/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h +++ b/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h @@ -57,10 +57,14 @@ namespace alpaka { template class QueueUniformCudaHipRt; } - using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; - using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; - using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; - using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueCudaRtBlocking = + uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueCudaRtNonBlocking = + uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtBlocking = + uniform_cuda_hip::detail::QueueUniformCudaHipRt; + using QueueHipRtNonBlocking = + uniform_cuda_hip::detail::QueueUniformCudaHipRt; // Events template diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h b/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h index 1b151aa0..cb056614 100644 --- a/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h +++ b/CLUEstering/alpaka/AlpakaCore/alpakaMemory.h @@ -86,14 +86,16 @@ namespace cms::alpakatools { } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, host_buffer> make_host_buffer(Extent extent) { return alpaka::allocBuf, Idx>(host, Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, host_buffer> make_host_buffer() { return alpaka::allocBuf, Idx>(host, Vec1D{std::extent_v}); @@ -103,7 +105,8 @@ namespace cms::alpakatools { // the memory is pinned according to the device associated to the queue template - std::enable_if_t, host_buffer> make_host_buffer(TQueue const& queue) { + std::enable_if_t, host_buffer> make_host_buffer( + TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(host, queue, Scalar{}); } else { @@ -112,7 +115,8 @@ namespace cms::alpakatools { } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, host_buffer> make_host_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { @@ -124,11 +128,13 @@ namespace cms::alpakatools { } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, host_buffer> make_host_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { - return allocCachedBuf, Idx>(host, queue, Vec1D{std::extent_v}); + return allocCachedBuf, Idx>( + host, queue, Vec1D{std::extent_v}); } else { return alpaka::allocMappedBuf, Idx>( host, alpaka::getDev(queue), Vec1D{std::extent_v}); @@ -151,14 +157,18 @@ namespace cms::alpakatools { } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, host_view> make_host_view(T& data, Extent extent) { - return alpaka::ViewPlainPtr, Dim1D, Idx>(data, host, Vec1D{extent}); + return alpaka::ViewPlainPtr, Dim1D, Idx>( + data, host, Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, host_view> + std::enable_if_t and + not std::is_array_v>, + host_view> make_host_view(T& data) { return alpaka::ViewPlainPtr, Dim1D, Idx>( data, host, Vec1D{std::extent_v}); @@ -170,12 +180,13 @@ namespace cms::alpakatools { using device_buffer = typename detail::buffer_type::type; template - std::enable_if_t, device_buffer, T>> make_device_buffer( - TQueue const& queue) { + std::enable_if_t, device_buffer, T>> + make_device_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf(alpaka::getDev(queue), queue, Scalar{}); } - if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + if constexpr (allocator_policy> == + AllocatorPolicy::Asynchronous) { return alpaka::allocAsyncBuf(queue, Scalar{}); } if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { @@ -184,33 +195,41 @@ namespace cms::alpakatools { } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, device_buffer, T>> make_device_buffer(TQueue const& queue, Extent extent) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { - return allocCachedBuf, Idx>(alpaka::getDev(queue), queue, Vec1D{extent}); + return allocCachedBuf, Idx>( + alpaka::getDev(queue), queue, Vec1D{extent}); } - if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { + if constexpr (allocator_policy> == + AllocatorPolicy::Asynchronous) { return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{extent}); } if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { - return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{extent}); + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), + Vec1D{extent}); } } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, device_buffer, T>> make_device_buffer(TQueue const& queue) { if constexpr (allocator_policy> == AllocatorPolicy::Caching) { return allocCachedBuf, Idx>( alpaka::getDev(queue), queue, Vec1D{std::extent_v}); } - if constexpr (allocator_policy> == AllocatorPolicy::Asynchronous) { - return alpaka::allocAsyncBuf, Idx>(queue, Vec1D{std::extent_v}); + if constexpr (allocator_policy> == + AllocatorPolicy::Asynchronous) { + return alpaka::allocAsyncBuf, Idx>(queue, + Vec1D{std::extent_v}); } if constexpr (allocator_policy> == AllocatorPolicy::Synchronous) { - return alpaka::allocBuf, Idx>(alpaka::getDev(queue), Vec1D{std::extent_v}); + return alpaka::allocBuf, Idx>(alpaka::getDev(queue), + Vec1D{std::extent_v}); } } @@ -220,8 +239,8 @@ namespace cms::alpakatools { using device_view = typename detail::view_type::type; template - std::enable_if_t, device_view> make_device_view(TDev const& device, - T& data) { + std::enable_if_t, device_view> make_device_view( + TDev const& device, T& data) { return alpaka::ViewPlainPtr(&data, device, Scalar{}); } @@ -231,14 +250,17 @@ namespace cms::alpakatools { } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, device_view> make_device_view(TDev const& device, T& data, Extent extent) { - return alpaka::ViewPlainPtr, Dim1D, Idx>(data, device, Vec1D{extent}); + return alpaka::ViewPlainPtr, Dim1D, Idx>( + data, device, Vec1D{extent}); } template - std::enable_if_t and not std::is_array_v>, + std::enable_if_t and + not std::is_array_v>, device_view> make_device_view(TDev const& device, T& data) { return alpaka::ViewPlainPtr, Dim1D, Idx>( diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h b/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h index 180f3d7b..d413a29c 100644 --- a/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h +++ b/CLUEstering/alpaka/AlpakaCore/alpakaWorkDiv.h @@ -26,20 +26,24 @@ namespace cms::alpakatools { /* * Return the integer division of the first argument by the second argument, rounded up to the next integer. */ - inline constexpr Idx divide_up_by(Idx value, Idx divisor) { return (value + divisor - 1) / divisor; } + inline constexpr Idx divide_up_by(Idx value, Idx divisor) { + return (value + divisor - 1) / divisor; + } /* * Creates the accelerator-dependent workdiv for 1-dimensional operations. */ template - inline WorkDiv make_workdiv(Idx blocksPerGrid, Idx threadsPerBlockOrElementsPerThread) { + inline WorkDiv make_workdiv(Idx blocksPerGrid, + Idx threadsPerBlockOrElementsPerThread) { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED if constexpr (std::is_same_v>) { // On GPU backends, each thread is looking at a single element: // - threadsPerBlockOrElementsPerThread is the number of threads per block; // - elementsPerThread is always 1. const auto elementsPerThread = Idx{1}; - return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + return WorkDiv( + blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_CUDA_ENABLED #if ALPAKA_ACC_GPU_HIP_ENABLED @@ -48,7 +52,8 @@ namespace cms::alpakatools { // - threadsPerBlockOrElementsPerThread is the number of threads per block; // - elementsPerThread is always 1. const auto elementsPerThread = Idx{1}; - return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + return WorkDiv( + blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_HIP_ENABLED { @@ -56,7 +61,8 @@ namespace cms::alpakatools { // - threadsPerBlock is always 1; // - threadsPerBlockOrElementsPerThread is the number of elements per thread. const auto threadsPerBlock = Idx{1}; - return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); + return WorkDiv( + blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); } } @@ -74,7 +80,8 @@ namespace cms::alpakatools { // - threadsPerBlockOrElementsPerThread is the number of threads per block; // - elementsPerThread is always 1. const auto elementsPerThread = Vec::ones(); - return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + return WorkDiv( + blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_CUDA_ENABLED #ifdef ALPAKA_ACC_GPU_HIP_ENABLED @@ -83,7 +90,8 @@ namespace cms::alpakatools { // - threadsPerBlockOrElementsPerThread is the number of threads per block; // - elementsPerThread is always 1. const auto elementsPerThread = Vec::ones(); - return WorkDiv(blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); + return WorkDiv( + blocksPerGrid, threadsPerBlockOrElementsPerThread, elementsPerThread); } else #endif // ALPAKA_ACC_GPU_HIP_ENABLED { @@ -91,7 +99,8 @@ namespace cms::alpakatools { // - threadsPerBlock is always 1; // - threadsPerBlockOrElementsPerThread is the number of elements per thread. const auto threadsPerBlock = Vec::ones(); - return WorkDiv(blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); + return WorkDiv( + blocksPerGrid, threadsPerBlock, threadsPerBlockOrElementsPerThread); } } @@ -104,12 +113,13 @@ namespace cms::alpakatools { * Warning: the max index is not truncated by the max number of elements of interest. */ template - ALPAKA_FN_ACC std::pair element_index_range_in_block(const TAcc& acc, - const Idx elementIdxShift, - const unsigned int dimIndex = 0u) { + ALPAKA_FN_ACC std::pair element_index_range_in_block( + const TAcc& acc, const Idx elementIdxShift, const unsigned int dimIndex = 0u) { // Take into account the thread index in block. - const Idx threadIdxLocal(alpaka::getIdx(acc)[dimIndex]); - const Idx threadDimension(alpaka::getWorkDiv(acc)[dimIndex]); + const Idx threadIdxLocal( + alpaka::getIdx(acc)[dimIndex]); + const Idx threadDimension( + alpaka::getWorkDiv(acc)[dimIndex]); // Compute the elements indexes in block. // Obviously relevant for CPU only. @@ -150,12 +160,12 @@ namespace cms::alpakatools { * Warning: the max index is not truncated by the max number of elements of interest. */ template - ALPAKA_FN_ACC std::pair element_index_range_in_grid(const TAcc& acc, - Idx elementIdxShift, - const unsigned int dimIndex = 0u) { + ALPAKA_FN_ACC std::pair element_index_range_in_grid( + const TAcc& acc, Idx elementIdxShift, const unsigned int dimIndex = 0u) { // Take into account the block index in grid. const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + const Idx blockDimension( + alpaka::getWorkDiv(acc)[dimIndex]); // Shift to get global indices in grid (instead of local to the block) elementIdxShift += blockIdxInGrid * blockDimension; @@ -169,10 +179,11 @@ namespace cms::alpakatools { * Truncated by the max number of elements of interest. */ template - ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated(const TAcc& acc, - const Idx maxNumberOfElements, - Idx elementIdxShift, - const unsigned int dimIndex = 0u) { + ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated( + const TAcc& acc, + const Idx maxNumberOfElements, + Idx elementIdxShift, + const unsigned int dimIndex = 0u) { // Check dimension //static_assert(dimIndex <= alpaka::Dim::value, //"Accelerator and maxNumberOfElements need to have same dimension."); @@ -194,7 +205,8 @@ namespace cms::alpakatools { ALPAKA_FN_ACC std::pair element_index_range_in_grid_truncated( const TAcc& acc, const Idx maxNumberOfElements, const unsigned int dimIndex = 0u) { Idx elementIdxShift = 0u; - return element_index_range_in_grid_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + return element_index_range_in_grid_truncated( + acc, maxNumberOfElements, elementIdxShift, dimIndex); } /********************************************* @@ -212,8 +224,8 @@ namespace cms::alpakatools { const Idx elementIdxShift, const Func func, const unsigned int dimIndex = 0) { - const auto& [firstElementIdx, endElementIdx] = - element_index_range_in_block_truncated(acc, maxNumberOfElements, elementIdxShift, dimIndex); + const auto& [firstElementIdx, endElementIdx] = element_index_range_in_block_truncated( + acc, maxNumberOfElements, elementIdxShift, dimIndex); for (Idx elementIdx = firstElementIdx; elementIdx < endElementIdx; ++elementIdx) { func(elementIdx); @@ -245,7 +257,8 @@ namespace cms::alpakatools { const unsigned int dimIndex = 0) { // Take into account the block index in grid to compute the element indices. const Idx blockIdxInGrid(alpaka::getIdx(acc)[dimIndex]); - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + const Idx blockDimension( + alpaka::getWorkDiv(acc)[dimIndex]); elementIdxShift += blockIdxInGrid * blockDimension; for_each_element_in_block(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); @@ -284,7 +297,8 @@ namespace cms::alpakatools { element_index_range_in_block(acc, elementIdxShift, dimIndex); // Stride = block size. - const Idx blockDimension(alpaka::getWorkDiv(acc)[dimIndex]); + const Idx blockDimension( + alpaka::getWorkDiv(acc)[dimIndex]); // Strided access. for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; @@ -309,7 +323,8 @@ namespace cms::alpakatools { const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - for_each_element_in_block_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_block_strided( + acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /* @@ -329,7 +344,8 @@ namespace cms::alpakatools { element_index_range_in_grid(acc, elementIdxShift, dimIndex); // Stride = grid size. - const Idx gridDimension(alpaka::getWorkDiv(acc)[dimIndex]); + const Idx gridDimension( + alpaka::getWorkDiv(acc)[dimIndex]); // Strided access. for (Idx threadIdx = firstElementIdxNoStride, endElementIdx = endElementIdxNoStride; @@ -354,7 +370,8 @@ namespace cms::alpakatools { const Func func, const unsigned int dimIndex = 0) { const Idx elementIdxShift = 0; - for_each_element_in_grid_strided(acc, maxNumberOfElements, elementIdxShift, func, dimIndex); + for_each_element_in_grid_strided( + acc, maxNumberOfElements, elementIdxShift, func, dimIndex); } /************************************************************** @@ -372,7 +389,11 @@ namespace cms::alpakatools { * NB 2: Modifies i, firstElementIdx and endElementIdx. */ ALPAKA_FN_ACC ALPAKA_FN_INLINE bool next_valid_element_index_strided( - Idx& i, Idx& firstElementIdx, Idx& endElementIdx, const Idx stride, const Idx maxNumberOfElements) { + Idx& i, + Idx& firstElementIdx, + Idx& endElementIdx, + const Idx stride, + const Idx maxNumberOfElements) { bool isNextStrideElementValid = true; if (i == endElementIdx) { firstElementIdx += stride; diff --git a/CLUEstering/alpaka/AlpakaCore/backend.h b/CLUEstering/alpaka/AlpakaCore/backend.h index 8d58953c..053c3545 100644 --- a/CLUEstering/alpaka/AlpakaCore/backend.h +++ b/CLUEstering/alpaka/AlpakaCore/backend.h @@ -4,7 +4,8 @@ enum class Backend { SERIAL, TBB, CUDA, HIP }; inline std::string const& name(Backend backend) { - static const std::string names[] = {"serial_sync", "tbb_async", "cuda_async", "rocm_async"}; + static const std::string names[] = { + "serial_sync", "tbb_async", "cuda_async", "rocm_async"}; return names[static_cast(backend)]; } diff --git a/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h index 69aca818..6372d497 100644 --- a/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h +++ b/CLUEstering/alpaka/AlpakaCore/getDeviceCachingAllocator.h @@ -50,7 +50,8 @@ namespace cms::alpakatools { } // namespace detail template - inline CachingAllocator& getDeviceCachingAllocator(TDevice const& device) { + inline CachingAllocator& getDeviceCachingAllocator( + TDevice const& device) { // initialise all allocators, one per device static auto allocators = detail::allocate_device_allocators(); diff --git a/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h b/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h index 5abbeaa0..b26efcf9 100644 --- a/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h +++ b/CLUEstering/alpaka/AlpakaCore/getDeviceIndex.h @@ -16,12 +16,16 @@ namespace cms::alpakatools { #ifdef ALPAKA_ACC_GPU_CUDA_ENABLED // overload for DevCudaRt - inline int getDeviceIndex(alpaka::DevCudaRt const& device) { return alpaka::getNativeHandle(device); } + inline int getDeviceIndex(alpaka::DevCudaRt const& device) { + return alpaka::getNativeHandle(device); + } #endif // ALPAKA_ACC_GPU_CUDA_ENABLED #ifdef ALPAKA_ACC_GPU_HIP_ENABLED // overload for DevHipRt - inline int getDeviceIndex(alpaka::DevHipRt const& device) { return alpaka::getNativeHandle(device); } + inline int getDeviceIndex(alpaka::DevHipRt const& device) { + return alpaka::getNativeHandle(device); + } #endif // ALPAKA_ACC_GPU_HIP_ENABLED } // namespace cms::alpakatools diff --git a/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h index 3103a7bf..2cf82cba 100644 --- a/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h +++ b/CLUEstering/alpaka/AlpakaCore/getHostCachingAllocator.h @@ -10,14 +10,15 @@ namespace cms::alpakatools { template inline CachingAllocator& getHostCachingAllocator() { // thread safe initialisation of the host allocator - static CachingAllocator allocator(host, - config::binGrowth, - config::minBin, - config::maxBin, - config::maxCachedBytes, - config::maxCachedFraction, - false, // reuseSameQueueAllocations - false); // debug + static CachingAllocator allocator( + host, + config::binGrowth, + config::minBin, + config::maxBin, + config::maxCachedBytes, + config::maxCachedFraction, + false, // reuseSameQueueAllocations + false); // debug // the public interface is thread safe return allocator; diff --git a/CLUEstering/alpaka/BindingModules/binding_cpu.cc b/CLUEstering/alpaka/BindingModules/binding_cpu.cc index a870d6f4..5ba368e0 100644 --- a/CLUEstering/alpaka/BindingModules/binding_cpu.cc +++ b/CLUEstering/alpaka/BindingModules/binding_cpu.cc @@ -44,26 +44,36 @@ namespace alpaka_serial_sync { // Running the clustering algorithm // switch (Ndim) { - [[unlikely]] case (1) : - return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[likely]] case (3) : - return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (4) : - return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (5) : - return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (6) : - return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (7) : - return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (8) : - return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (9) : - return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (10) : - return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (1): + return run1( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2): + return run2( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3): + return run3( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4): + return run4( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5): + return run5( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6): + return run6( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7): + return run7( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8): + return run8( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9): + return run9( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10): + return run10( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); [[unlikely]] default: std::cout << "This library only works up to 10 dimensions\n"; return {}; @@ -87,26 +97,36 @@ namespace alpaka_serial_sync { // Running the clustering algorithm // switch (Ndim) { - [[unlikely]] case (1) : - return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[likely]] case (3) : - return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (4) : - return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (5) : - return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (6) : - return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (7) : - return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (8) : - return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (9) : - return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (10) : - return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (1): + return run1( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2): + return run2( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3): + return run3( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4): + return run4( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5): + return run5( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6): + return run6( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7): + return run7( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8): + return run8( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9): + return run9( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10): + return run10( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); [[unlikely]] default: std::cout << "This library only works up to 10 dimensions\n"; return {}; @@ -130,26 +150,36 @@ namespace alpaka_serial_sync { // Running the clustering algorithm // switch (Ndim) { - [[unlikely]] case (1) : - return run1(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[likely]] case (2) : - return run2(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[likely]] case (3) : - return run3(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (4) : - return run4(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (5) : - return run5(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (6) : - return run6(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (7) : - return run7(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (8) : - return run8(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (9) : - return run9(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); - [[unlikely]] case (10) : - return run10(dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (1): + return run1( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (2): + return run2( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[likely]] case (3): + return run3( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (4): + return run4( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (5): + return run5( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (6): + return run6( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (7): + return run7( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (8): + return run8( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (9): + return run9( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); + [[unlikely]] case (10): + return run10( + dc, rhoc, outlier, pPBin, coords, weights, kernel, queue_, block_size); [[unlikely]] default: std::cout << "This library only works up to 10 dimensions\n"; return {}; @@ -159,7 +189,9 @@ namespace alpaka_serial_sync { PYBIND11_MODULE(CLUE_CPU_Serial, m) { m.doc() = "Binding of the CLUE algorithm running serially on CPU"; - m.def("listDevices", &listDevices, "List the available devices for the CPU serial backend"); + m.def("listDevices", + &listDevices, + "List the available devices for the CPU serial backend"); m.def("mainRun", pybind11::overload_cast(m, "FlatKernel").def(pybind11::init()); - pybind11::class_(m, "ExponentialKernel").def(pybind11::init()); - pybind11::class_(m, "GaussianKernel").def(pybind11::init()); + pybind11::class_(m, "ExponentialKernel") + .def(pybind11::init()); + pybind11::class_(m, "GaussianKernel") + .def(pybind11::init()); } diff --git a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h index 356233dd..8600ecd0 100644 --- a/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h +++ b/CLUEstering/alpaka/CLUE/CLUEAlgoAlpaka.h @@ -28,8 +28,12 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class CLUEAlgoAlpaka { public: CLUEAlgoAlpaka() = delete; - explicit CLUEAlgoAlpaka(float dc, float rhoc, float outlierDeltaFactor, int pPBin, Queue queue_) - : dc_{dc}, rhoc_{rhoc}, outlierDeltaFactor_{outlierDeltaFactor}, pointsPerTile_{pPBin} { + explicit CLUEAlgoAlpaka( + float dc, float rhoc, float outlierDeltaFactor, int pPBin, Queue queue_) + : dc_{dc}, + rhoc_{rhoc}, + outlierDeltaFactor_{outlierDeltaFactor}, + pointsPerTile_{pPBin} { init_device(queue_); } @@ -55,15 +59,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // Buffers std::optional>> d_tiles; - std::optional>> - d_seeds; std::optional< - cms::alpakatools::device_buffer[]>> + cms::alpakatools::device_buffer>> + d_seeds; + std::optional[]>> d_followers; // Private methods void init_device(Queue queue_); - void setup(const Points& h_points, PointsAlpaka& d_points, Queue queue_, size_t block_size); + void setup(const Points& h_points, + PointsAlpaka& d_points, + Queue queue_, + size_t block_size); // Construction of the tiles void calculate_tile_size(TilesAlpaka& h_tiles, const Points& h_points); @@ -75,8 +85,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const Points& h_points) { for (int i{}; i != Ndim; ++i) { float tileSize; - float dimMax{*std::max_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; - float dimMin{*std::min_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; + float dimMax{ + *std::max_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; + float dimMin{ + *std::min_element(h_points.m_coords[i].begin(), h_points.m_coords[i].end())}; VecArray temp; temp.push_back_unsafe(dimMin); @@ -91,10 +103,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template void CLUEAlgoAlpaka::init_device(Queue queue_) { d_tiles = cms::alpakatools::make_device_buffer>(queue_); - d_seeds = cms::alpakatools::make_device_buffer>(queue_); - d_followers = - cms::alpakatools::make_device_buffer[]>(queue_, - reserve); + d_seeds = cms::alpakatools::make_device_buffer< + cms::alpakatools::VecArray>(queue_); + d_followers = cms::alpakatools::make_device_buffer< + cms::alpakatools::VecArray[]>(queue_, reserve); // Copy to the public pointers m_seeds = (*d_seeds).data(); @@ -114,34 +126,40 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::memcpy(queue_, *d_tiles, cms::alpakatools::make_host_view(temp)); m_tiles = (*d_tiles).data(); alpaka::memcpy( - queue_, d_points.coords, cms::alpakatools::make_host_view(h_points.m_coords.data(), h_points.n)); + queue_, + d_points.coords, + cms::alpakatools::make_host_view(h_points.m_coords.data(), h_points.n)); alpaka::memcpy( - queue_, d_points.weight, cms::alpakatools::make_host_view(h_points.m_weight.data(), h_points.n)); + queue_, + d_points.weight, + cms::alpakatools::make_host_view(h_points.m_weight.data(), h_points.n)); alpaka::memset(queue_, (*d_seeds), 0x00); // Define the working division Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); - alpaka::enqueue( - queue_, - alpaka::createTaskKernel(working_div, KernelResetFollowers{}, m_followers, h_points.n)); + alpaka::enqueue(queue_, + alpaka::createTaskKernel( + working_div, KernelResetFollowers{}, m_followers, h_points.n)); } // Public methods template template - std::vector> CLUEAlgoAlpaka::make_clusters(Points& h_points, - PointsAlpaka& d_points, - const KernelType& kernel, - Queue queue_, - size_t block_size) { + std::vector> CLUEAlgoAlpaka::make_clusters( + Points& h_points, + PointsAlpaka& d_points, + const KernelType& kernel, + Queue queue_, + size_t block_size) { setup(h_points, d_points, queue_, block_size); const Idx grid_size = cms::alpakatools::divide_up_by(h_points.n, block_size); auto working_div = cms::alpakatools::make_workdiv(grid_size, block_size); - alpaka::enqueue(queue_, - alpaka::createTaskKernel( - working_div, KernelFillTiles(), d_points.view(), m_tiles, h_points.n)); + alpaka::enqueue( + queue_, + alpaka::createTaskKernel( + working_div, KernelFillTiles(), d_points.view(), m_tiles, h_points.n)); alpaka::enqueue(queue_, alpaka::createTaskKernel(working_div, @@ -174,11 +192,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // We change the working division when assigning the clusters const Idx grid_size_seeds = cms::alpakatools::divide_up_by(max_seeds, block_size); - auto working_div_seeds = cms::alpakatools::make_workdiv(grid_size_seeds, block_size); - alpaka::enqueue( - queue_, - alpaka::createTaskKernel( - working_div_seeds, KernelAssignClusters(), m_seeds, m_followers, d_points.view())); + auto working_div_seeds = + cms::alpakatools::make_workdiv(grid_size_seeds, block_size); + alpaka::enqueue(queue_, + alpaka::createTaskKernel(working_div_seeds, + KernelAssignClusters(), + m_seeds, + m_followers, + d_points.view())); // Wait for all the operations in the queue to finish alpaka::wait(queue_); @@ -191,14 +212,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::make_host_view(h_points.m_delta.data(), h_points.n), d_points.delta, static_cast(h_points.n)); - alpaka::memcpy(queue_, - cms::alpakatools::make_host_view(h_points.m_nearestHigher.data(), h_points.n), - d_points.nearest_higher, - static_cast(h_points.n)); - alpaka::memcpy(queue_, - cms::alpakatools::make_host_view(h_points.m_clusterIndex.data(), h_points.n), - d_points.cluster_index, - static_cast(h_points.n)); + alpaka::memcpy( + queue_, + cms::alpakatools::make_host_view(h_points.m_nearestHigher.data(), h_points.n), + d_points.nearest_higher, + static_cast(h_points.n)); + alpaka::memcpy( + queue_, + cms::alpakatools::make_host_view(h_points.m_clusterIndex.data(), h_points.n), + d_points.cluster_index, + static_cast(h_points.n)); alpaka::memcpy(queue_, cms::alpakatools::make_host_view(h_points.m_isSeed.data(), h_points.n), d_points.is_seed, diff --git a/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h b/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h index 660ccc08..66161917 100644 --- a/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h +++ b/CLUEstering/alpaka/CLUE/CLUEAlpakaKernels.h @@ -43,17 +43,18 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }; template - ALPAKA_FN_HOST_ACC void for_recursion(const TAcc& acc, - VecArray& base_vec, - const VecArray, Ndim>& search_box, - TilesAlpaka* tiles, - PointsView* dev_points, - const KernelType& kernel, - /* const VecArray, Ndim>& domains, */ - const VecArray& coords_i, - float* rho_i, - float dc, - uint32_t point_id) { + ALPAKA_FN_HOST_ACC void for_recursion( + const TAcc& acc, + VecArray& base_vec, + const VecArray, Ndim>& search_box, + TilesAlpaka* tiles, + PointsView* dev_points, + const KernelType& kernel, + /* const VecArray, Ndim>& domains, */ + const VecArray& coords_i, + float* rho_i, + float dc, + uint32_t point_id) { if constexpr (N_ == 0) { int binId{tiles->getGlobalBinByBin(acc, base_vec)}; // get the size of this bin @@ -71,7 +72,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } if (dist_ij_sq <= dc * dc) { - *rho_i += kernel(acc, alpaka::math::sqrt(acc, dist_ij_sq), point_id, j) * dev_points->weight[j]; + *rho_i += kernel(acc, alpaka::math::sqrt(acc, dist_ij_sq), point_id, j) * + dev_points->weight[j]; } } // end of interate inside this bin @@ -82,8 +84,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { i <= search_box[search_box.capacity() - N_][1]; ++i) { base_vec[base_vec.capacity() - N_] = i; - for_recursion( - acc, base_vec, search_box, tiles, dev_points, kernel, coords_i, rho_i, dc, point_id); + for_recursion(acc, + base_vec, + search_box, + tiles, + dev_points, + kernel, + coords_i, + rho_i, + dc, + point_id); } } } @@ -116,8 +126,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { dev_tiles->searchBox(acc, searchbox_extremes, &search_box); VecArray base_vec; - for_recursion( - acc, base_vec, search_box, dev_tiles, dev_points, kernel, coords_i, &rho_i, dc, i); + for_recursion(acc, + base_vec, + search_box, + dev_tiles, + dev_points, + kernel, + coords_i, + &rho_i, + dc, + i); dev_points->rho[i] = rho_i; }); @@ -150,7 +168,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rho_j{dev_points->rho[j]}; bool found_higher{(rho_j > rho_i)}; // in the rare case where rho is the same, use detid - found_higher = found_higher || ((rho_j == rho_i) && (rho_j > 0.f) && (j > point_id)); + found_higher = + found_higher || ((rho_j == rho_i) && (rho_j > 0.f) && (j > point_id)); // Calculate the distance between the two points VecArray coords_j{dev_points->coords[j]}; @@ -171,10 +190,21 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return; } else { - for (unsigned int i{s_box[s_box.capacity() - N_][0]}; i <= s_box[s_box.capacity() - N_][1]; ++i) { + for (unsigned int i{s_box[s_box.capacity() - N_][0]}; + i <= s_box[s_box.capacity() - N_][1]; + ++i) { base_vec[base_vec.capacity() - N_] = i; - for_recursion_nearest_higher( - acc, base_vec, s_box, tiles, dev_points, coords_i, rho_i, delta_i, nh_i, dm_sq, point_id); + for_recursion_nearest_higher(acc, + base_vec, + s_box, + tiles, + dev_points, + coords_i, + rho_i, + delta_i, + nh_i, + dm_sq, + point_id); } } } diff --git a/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h b/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h index 2e6eb3c6..1deba188 100644 --- a/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h +++ b/CLUEstering/alpaka/CLUE/ConvolutionalKernel.h @@ -21,7 +21,10 @@ class FlatKernel { // Overload call operator template - ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, + float dist_ij, + int point_id, + int j) const { if (point_id == j) { return 1.f; } else { @@ -44,13 +47,17 @@ class GaussianKernel { // Overload call operator template - ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, + float dist_ij, + int point_id, + int j) const { if (point_id == j) { return 1.f; } else { return (m_gaus_amplitude * - alpaka::math::exp( - acc, -(dist_ij - m_gaus_avg) * (dist_ij - m_gaus_avg) / (2 * m_gaus_std * m_gaus_std))); + alpaka::math::exp(acc, + -(dist_ij - m_gaus_avg) * (dist_ij - m_gaus_avg) / + (2 * m_gaus_std * m_gaus_std))); } } }; @@ -68,7 +75,10 @@ class ExponentialKernel { // Overload call operator template - ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, float dist_ij, int point_id, int j) const { + ALPAKA_FN_HOST_ACC float operator()(const TAcc& acc, + float dist_ij, + int point_id, + int j) const { if (point_id == j) { return 1.f; } else { diff --git a/CLUEstering/alpaka/CLUE/Run.h b/CLUEstering/alpaka/CLUE/Run.h index cf1502fc..2080fd25 100644 --- a/CLUEstering/alpaka/CLUE/Run.h +++ b/CLUEstering/alpaka/CLUE/Run.h @@ -11,9 +11,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -29,9 +29,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -47,9 +47,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -65,9 +65,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -83,9 +83,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -101,9 +101,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -119,9 +119,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -137,9 +137,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -155,9 +155,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -173,9 +173,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -191,9 +191,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -209,9 +209,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -227,9 +227,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -245,9 +245,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -263,9 +263,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -281,9 +281,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -299,9 +299,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -317,9 +317,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -335,9 +335,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -353,9 +353,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -371,9 +371,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -389,9 +389,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -407,9 +407,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -425,9 +425,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -443,9 +443,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -461,9 +461,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -479,9 +479,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -497,9 +497,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const FlatKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const FlatKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -515,9 +515,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const ExponentialKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const ExponentialKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); @@ -533,9 +533,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { float rhoc, float outlier, int pPBin, - std::vector> const &coordinates, - std::vector const &weight, - const GaussianKernel &kernel, + std::vector> const& coordinates, + std::vector const& weight, + const GaussianKernel& kernel, Queue queue_, size_t block_size) { CLUEAlgoAlpaka algo(dc, rhoc, outlier, pPBin, queue_); diff --git a/CLUEstering/alpaka/DataFormats/Points.h b/CLUEstering/alpaka/DataFormats/Points.h index 665802dc..9dc05dbb 100644 --- a/CLUEstering/alpaka/DataFormats/Points.h +++ b/CLUEstering/alpaka/DataFormats/Points.h @@ -12,10 +12,11 @@ using cms::alpakatools::VecArray; -template struct Points { +template +struct Points { Points() = default; - Points(const std::vector> &coords, - const std::vector &weight) + Points(const std::vector>& coords, + const std::vector& weight) : m_coords{coords}, m_weight{weight}, n{weight.size()} { m_rho.resize(n); m_delta.resize(n); @@ -23,10 +24,9 @@ template struct Points { m_clusterIndex.resize(n); m_isSeed.resize(n); } - Points(const std::vector> &coords, - const std::vector &weight) + Points(const std::vector>& coords, const std::vector& weight) : m_weight{weight}, n{weight.size()} { - for (const auto &x : coords) { + for (const auto& x : coords) { VecArray temp_vecarray; for (auto value : x) { temp_vecarray.push_back_unsafe(value); diff --git a/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h b/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h index f2e3ed95..eeca9547 100644 --- a/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h +++ b/CLUEstering/alpaka/DataFormats/alpaka/AlpakaVecArray.h @@ -9,7 +9,7 @@ namespace cms::alpakatools { template struct VecArray { - inline constexpr int push_back_unsafe(const T &element) { + inline constexpr int push_back_unsafe(const T& element) { auto previousSize = m_size; m_size++; if (previousSize < maxSize) { @@ -22,7 +22,7 @@ namespace cms::alpakatools { } template - constexpr int emplace_back_unsafe(Ts &&...args) { + constexpr int emplace_back_unsafe(Ts&&... args) { auto previousSize = m_size; m_size++; if (previousSize < maxSize) { @@ -34,7 +34,7 @@ namespace cms::alpakatools { } } - inline constexpr T &back() const { + inline constexpr T& back() const { if (m_size > 0) { return m_data[m_size - 1]; } else @@ -43,7 +43,7 @@ namespace cms::alpakatools { // thread-safe version of the vector, when used in a CUDA kernel template - ALPAKA_FN_ACC int push_back(const T_Acc &acc, const T &element) { + ALPAKA_FN_ACC int push_back(const T_Acc& acc, const T& element) { auto previousSize = atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); if (previousSize < maxSize) { m_data[previousSize] = element; @@ -56,7 +56,7 @@ namespace cms::alpakatools { } template - ALPAKA_FN_ACC int emplace_back(const T_Acc &acc, Ts &&...args) { + ALPAKA_FN_ACC int emplace_back(const T_Acc& acc, Ts&&... args) { auto previousSize = atomicAdd(acc, &m_size, 1, alpaka::hierarchy::Blocks{}); if (previousSize < maxSize) { (new (&m_data[previousSize]) T(std::forward(args)...)); @@ -76,16 +76,16 @@ namespace cms::alpakatools { return T(); } - inline constexpr T const *begin() const { return m_data; } - inline constexpr T const *end() const { return m_data + m_size; } - inline constexpr T *begin() { return m_data; } - inline constexpr T *end() { return m_data + m_size; } + inline constexpr T const* begin() const { return m_data; } + inline constexpr T const* end() const { return m_data + m_size; } + inline constexpr T* begin() { return m_data; } + inline constexpr T* end() { return m_data + m_size; } inline constexpr int size() const { return m_size; } - inline constexpr T &operator[](int i) { return m_data[i]; } - inline constexpr const T &operator[](int i) const { return m_data[i]; } + inline constexpr T& operator[](int i) { return m_data[i]; } + inline constexpr const T& operator[](int i) const { return m_data[i]; } inline constexpr void reset() { m_size = 0; } inline constexpr int capacity() const { return maxSize; } - inline constexpr T const *data() const { return m_data; } + inline constexpr T const* data() const { return m_data; } inline constexpr void resize(int size) { m_size = size; } inline constexpr bool empty() const { return 0 == m_size; } inline constexpr bool full() const { return maxSize == m_size; } diff --git a/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h b/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h index 9fe71ef4..ec8347f1 100644 --- a/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h +++ b/CLUEstering/alpaka/DataFormats/alpaka/PointsAlpaka.h @@ -18,7 +18,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { public: PointsAlpaka() = delete; explicit PointsAlpaka(Queue stream, int n_points) - : coords{cms::alpakatools::make_device_buffer[]>(stream, n_points)}, + : coords{cms::alpakatools::make_device_buffer[]>(stream, + n_points)}, weight{cms::alpakatools::make_device_buffer(stream, n_points)}, rho{cms::alpakatools::make_device_buffer(stream, n_points)}, delta{cms::alpakatools::make_device_buffer(stream, n_points)}, diff --git a/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h b/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h index 9f5812a1..7764e414 100644 --- a/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h +++ b/CLUEstering/alpaka/DataFormats/alpaka/TilesAlpaka.h @@ -28,7 +28,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template class TilesAlpaka { public: - TilesAlpaka() : n_tiles{1000}, n_tiles_per_dim{static_cast(std::pow(1000, 1. / Ndim))} {}; + TilesAlpaka() + : n_tiles{1000}, n_tiles_per_dim{static_cast(std::pow(1000, 1. / Ndim))} {}; // Public member VecArray, Ndim> min_max; @@ -41,7 +42,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { int nPerDim() const { return n_tiles_per_dim; } template - ALPAKA_FN_HOST_ACC inline constexpr int getBin(const TAcc& acc, float coord_, int dim_) const { + ALPAKA_FN_HOST_ACC inline constexpr int getBin(const TAcc& acc, + float coord_, + int dim_) const { int coord_Bin{(int)((coord_ - min_max[dim_][0]) / tile_size[dim_])}; // Address the cases of underflow and overflow and underflow @@ -52,8 +55,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } template - ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBin(const TAcc& acc, - const VecArray& coords) const { + ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBin( + const TAcc& acc, const VecArray& coords) const { int globalBin{getBin(acc, coords[0], 0)}; for (int i{1}; i != Ndim; ++i) { globalBin += n_tiles_per_dim * getBin(acc, coords[i], i); @@ -62,8 +65,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } template - ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBinByBin(const TAcc& acc, - const VecArray& Bins) const { + ALPAKA_FN_HOST_ACC inline constexpr int getGlobalBinByBin( + const TAcc& acc, const VecArray& Bins) const { uint32_t globalBin{Bins[0]}; for (int i{1}; i != Ndim; ++i) { globalBin += n_tiles_per_dim * Bins[i]; @@ -72,14 +75,17 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } template - ALPAKA_FN_ACC inline constexpr void fill(const TAcc& acc, const VecArray& coords, int i) { + ALPAKA_FN_ACC inline constexpr void fill(const TAcc& acc, + const VecArray& coords, + int i) { m_tiles[getGlobalBin(acc, coords)].push_back(acc, i); } template - ALPAKA_FN_ACC inline void searchBox(const TAcc& acc, - const VecArray, Ndim>& sb_extremes, - VecArray, Ndim>* search_box) { + ALPAKA_FN_ACC inline void searchBox( + const TAcc& acc, + const VecArray, Ndim>& sb_extremes, + VecArray, Ndim>* search_box) { for (int dim{}; dim != Ndim; ++dim) { VecArray dim_sb; dim_sb.push_back_unsafe(getBin(acc, sb_extremes[dim][0], dim)); @@ -97,7 +103,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } } - ALPAKA_FN_HOST_ACC inline constexpr VecArray& operator[](int globalBinId) { + ALPAKA_FN_HOST_ACC inline constexpr VecArray& operator[]( + int globalBinId) { return m_tiles[globalBinId]; }