cms-patatrack · sbaldu · Mar 19, 2024 · Mar 19, 2024 · Mar 19, 2024 · Mar 19, 2024
diff --git a/.clang-format b/.clang-format
@@ -1,7 +1,7 @@
 ---
 Language:        Cpp
 BasedOnStyle:  Google
-ColumnLimit:     108
+ColumnLimit:     90
 NamespaceIndentation: All
 SortIncludes:    false
 IndentWidth:     2
@@ -11,8 +11,10 @@ PenaltyExcessCharacter: 100
 AlignAfterOpenBracket: Align
 AllowShortIfStatementsOnASingleLine: false
 AllowShortLoopsOnASingleLine: false
-BinPackParameters: false
 AlwaysBreakTemplateDeclarations: Yes
 ReflowComments: false
 BinPackArguments: false
 BinPackParameters: false
+DerivePointerAlignment: false
+PointerAlignment: Left
+ReferenceAlignment: Left
diff --git a/.github/workflows/clang_format.yml b/.github/workflows/clang_format.yml
@@ -19,6 +19,6 @@ jobs:
     - name: Run clang-format style check
       uses: jidicula/[email protected]
       with:
-        clang-format-version: '16'
+        clang-format-version: '17'
         check-path: ${{ matrix.path }}
         exclude-regex: 'CLUEstering/include/test/doctest.h'
diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h
@@ -14,7 +14,8 @@ namespace cms::alpakatools {
   template <typename TDev>
   constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;
 
-#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || \
+    defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
   template <>
   constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCpu> =
 #if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR

diff --git a/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h
@@ -39,7 +39,12 @@ namespace cms::alpakatools {
 
     //! The caching memory allocator implementation for the pinned host memory
     template <typename TElem, typename TDim, typename TIdx>
-    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
+    struct CachedBufAlloc<TElem,
+                          TDim,
+                          TIdx,
+                          alpaka::DevCpu,
+                          alpaka::QueueCudaRtNonBlocking,
+                          void> {
       template <typename TExtent>
       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                                 alpaka::QueueCudaRtNonBlocking queue,
@@ -96,7 +101,12 @@ namespace cms::alpakatools {
 
     //! The caching memory allocator implementation for the pinned host memory
     template <typename TElem, typename TDim, typename TIdx>
-    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
+    struct CachedBufAlloc<TElem,
+                          TDim,
+                          TIdx,
+                          alpaka::DevCpu,
+                          alpaka::QueueHipRtNonBlocking,
+                          void> {
       template <typename TExtent>
       ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                                 alpaka::QueueHipRtNonBlocking queue,
@@ -152,9 +162,11 @@ namespace cms::alpakatools {
   }  // namespace traits
 
   template <typename TElem, typename TIdx, typename TExtent, typename TQueue, typename TDev>
-  ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
-    return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(
-        dev, queue, extent);
+  ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev,
+                                     TQueue queue,
+                                     TExtent const& extent = TExtent()) {
+    return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::
+        allocCachedBuf(dev, queue, extent);
   }
 
 }  // namespace cms::alpakatools

diff --git a/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h b/CLUEstering/alpaka/AlpakaCore/CachingAllocator.h
@@ -85,14 +85,16 @@ namespace cms::alpakatools {
   template <typename TDevice, typename TQueue>
   class CachingAllocator {
   public:
-    using Device = TDevice;              // the "memory device", where the memory will be allocated
-    using Queue = TQueue;                // the queue used to submit the memory operations
+    using Device = TDevice;  // the "memory device", where the memory will be allocated
+    using Queue = TQueue;    // the queue used to submit the memory operations
     using Event = alpaka::Event<Queue>;  // the events used to synchronise the operations
     using Buffer = alpaka::Buf<Device, std::byte, alpaka::DimInt<1u>, size_t>;
 
     // The "memory device" type can either be the same as the "synchronisation device" type, or be the host CPU.
-    static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
-                  "The \"memory device\" type can either be the same as the \"synchronisation device\" "
+    static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or
+                      std::is_same_v<Device, alpaka::DevCpu>,
+                  "The \"memory device\" type can either be the same as the "
+                  "\"synchronisation device\" "
                   "type, or be the "
                   "host CPU.");
 
@@ -112,8 +114,8 @@ namespace cms::alpakatools {
         size_t maxCachedBytes,   // total storage for the allocator (0 means no limit);
         double
             maxCachedFraction,  // fraction of total device memory taken for the allocator (0 means no limit);
-                                // if both maxCachedBytes and maxCachedFraction are non-zero,
-                                // the smallest resulting value is used.
+        // if both maxCachedBytes and maxCachedFraction are non-zero,
+        // the smallest resulting value is used.
         bool reuseSameQueueAllocations,  // reuse non-ready allocations if they are in the same queue as the new one;
         // this is safe only if all memory operations are scheduled in the same queue
         bool debug)
@@ -135,7 +137,8 @@ namespace cms::alpakatools {
             << "  resulting bins:\n";
         for (auto bin = minBin_; bin <= maxBin_; ++bin) {
           auto binSize = detail::power(binGrowth, bin);
-          out << "    " << std::right << std::setw(12) << detail::as_bytes(binSize) << '\n';
+          out << "    " << std::right << std::setw(12) << detail::as_bytes(binSize)
+              << '\n';
         }
         out << "  maximum amount of cached memory: " << detail::as_bytes(maxCachedBytes_);
         std::cout << out.str() << std::endl;
@@ -201,22 +204,26 @@ namespace cms::alpakatools {
 
         if (debug_) {
           std::ostringstream out;
-          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned " << block.bytes
-              << " bytes at " << ptr << " from associated queue " << block.queue->m_spQueueImpl.get()
-              << " , event " << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
-              << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size()
-              << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " returned "
+              << block.bytes << " bytes at " << ptr << " from associated queue "
+              << block.queue->m_spQueueImpl.get() << " , event "
+              << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
+              << " available blocks cached (" << cachedBytes_.free << " bytes), "
+              << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
+              << " bytes) outstanding." << std::endl;
           std::cout << out.str() << std::endl;
         }
       } else {
         // if the buffer is not recached, it is automatically freed when block goes out of scope
         if (debug_) {
           std::ostringstream out;
-          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << block.bytes
-              << " bytes at " << ptr << " from associated queue " << block.queue->m_spQueueImpl.get()
-              << ", event " << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
-              << " available blocks cached (" << cachedBytes_.free << " bytes), " << liveBlocks_.size()
-              << " live blocks (" << cachedBytes_.live << " bytes) outstanding." << std::endl;
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed "
+              << block.bytes << " bytes at " << ptr << " from associated queue "
+              << block.queue->m_spQueueImpl.get() << ", event "
+              << block.event->m_spEventImpl.get() << " .\n\t\t " << cachedBlocks_.size()
+              << " available blocks cached (" << cachedBytes_.free << " bytes), "
+              << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
+              << " bytes) outstanding." << std::endl;
           std::cout << out.str() << std::endl;
         }
       }
@@ -257,10 +264,11 @@ namespace cms::alpakatools {
         return std::make_tuple(minBin_, minBinBytes_);
       }
       if (bytes > maxBinBytes_) {
-        throw std::runtime_error("Requested allocation size " + std::to_string(bytes) +
-                                 " bytes is too large for the caching detail with maximum bin " +
-                                 std::to_string(maxBinBytes_) +
-                                 " bytes. You might want to increase the maximum bin size");
+        throw std::runtime_error(
+            "Requested allocation size " + std::to_string(bytes) +
+            " bytes is too large for the caching detail with maximum bin " +
+            std::to_string(maxBinBytes_) +
+            " bytes. You might want to increase the maximum bin size");
       }
       unsigned int bin = minBin_;
       size_t binBytes = minBinBytes_;
@@ -301,11 +309,13 @@ namespace cms::alpakatools {
 
           if (debug_) {
             std::ostringstream out;
-            out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " reused cached block at "
-                << block.buffer->data() << " (" << block.bytes << " bytes) for queue "
-                << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get()
-                << " (previously associated with stream " << iBlock->second.queue->m_spQueueImpl.get()
-                << " , event " << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl;
+            out << "\t" << deviceType_ << " " << alpaka::getName(device_)
+                << " reused cached block at " << block.buffer->data() << " ("
+                << block.bytes << " bytes) for queue " << block.queue->m_spQueueImpl.get()
+                << ", event " << block.event->m_spEventImpl.get()
+                << " (previously associated with stream "
+                << iBlock->second.queue->m_spQueueImpl.get() << " , event "
+                << iBlock->second.event->m_spEventImpl.get() << ")." << std::endl;
             std::cout << out.str() << std::endl;
           }
 
@@ -324,11 +334,14 @@ namespace cms::alpakatools {
         return alpaka::allocBuf<std::byte, size_t>(device_, bytes);
       } else if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
         // allocate pinned host memory accessible by the queue's platform
-        return alpaka::allocMappedBuf<alpaka::Pltf<alpaka::Dev<Queue>>, std::byte, size_t>(device_, bytes);
+        return alpaka::allocMappedBuf<alpaka::Pltf<alpaka::Dev<Queue>>, std::byte, size_t>(
+            device_, bytes);
       } else {
         // unsupported combination
-        static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or std::is_same_v<Device, alpaka::DevCpu>,
-                      "The \"memory device\" type can either be the same as the \"synchronisation device\" "
+        static_assert(std::is_same_v<Device, alpaka::Dev<Queue>> or
+                          std::is_same_v<Device, alpaka::DevCpu>,
+                      "The \"memory device\" type can either be the same as the "
+                      "\"synchronisation device\" "
                       "type, or be "
                       "the host CPU.");
       }
@@ -341,8 +354,9 @@ namespace cms::alpakatools {
         // the allocation attempt failed: free all cached blocks on the device and retry
         if (debug_) {
           std::ostringstream out;
-          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " failed to allocate "
-              << block.bytes << " bytes for queue " << block.queue->m_spQueueImpl.get()
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_)
+              << " failed to allocate " << block.bytes << " bytes for queue "
+              << block.queue->m_spQueueImpl.get()
               << ", retrying after freeing cached allocations" << std::endl;
           std::cout << out.str() << std::endl;
         }
@@ -366,10 +380,10 @@ namespace cms::alpakatools {
 
       if (debug_) {
         std::ostringstream out;
-        out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " allocated new block at "
-            << block.buffer->data() << " (" << block.bytes << " bytes associated with queue "
-            << block.queue->m_spQueueImpl.get() << ", event " << block.event->m_spEventImpl.get() << "."
-            << std::endl;
+        out << "\t" << deviceType_ << " " << alpaka::getName(device_)
+            << " allocated new block at " << block.buffer->data() << " (" << block.bytes
+            << " bytes associated with queue " << block.queue->m_spQueueImpl.get()
+            << ", event " << block.event->m_spEventImpl.get() << "." << std::endl;
         std::cout << out.str() << std::endl;
       }
     }
@@ -383,10 +397,11 @@ namespace cms::alpakatools {
 
         if (debug_) {
           std::ostringstream out;
-          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed " << iBlock->second.bytes
-              << " bytes.\n\t\t  " << (cachedBlocks_.size() - 1) << " available blocks cached ("
-              << cachedBytes_.free << " bytes), " << liveBlocks_.size() << " live blocks ("
-              << cachedBytes_.live << " bytes) outstanding." << std::endl;
+          out << "\t" << deviceType_ << " " << alpaka::getName(device_) << " freed "
+              << iBlock->second.bytes << " bytes.\n\t\t  " << (cachedBlocks_.size() - 1)
+              << " available blocks cached (" << cachedBytes_.free << " bytes), "
+              << liveBlocks_.size() << " live blocks (" << cachedBytes_.live
+              << " bytes) outstanding." << std::endl;
           std::cout << out.str() << std::endl;
         }
 
@@ -395,18 +410,22 @@ namespace cms::alpakatools {
     }
 
     // TODO replace with a tbb::concurrent_multimap ?
-    using CachedBlocks = std::multimap<unsigned int, BlockDescriptor>;  // ordered by the allocation bin
+    using CachedBlocks =
+        std::multimap<unsigned int, BlockDescriptor>;  // ordered by the allocation bin
     // TODO replace with a tbb::concurrent_map ?
-    using BusyBlocks = std::map<void*, BlockDescriptor>;  // ordered by the address of the allocated memory
+    using BusyBlocks =
+        std::map<void*, BlockDescriptor>;  // ordered by the address of the allocated memory
 
-    inline static const std::string deviceType_ = boost::core::demangle(typeid(Device).name());
+    inline static const std::string deviceType_ =
+        boost::core::demangle(typeid(Device).name());
 
     mutable std::mutex mutex_;
     Device device_;  // the device where the memory is allocated
 
     CachedBytes cachedBytes_;
     CachedBlocks cachedBlocks_;  // Set of cached device allocations available for reuse
-    BusyBlocks liveBlocks_;      // map of pointers to the live device allocations currently in use
+    BusyBlocks
+        liveBlocks_;  // map of pointers to the live device allocations currently in use
 
     const unsigned int binGrowth_;  // Geometric growth factor for bin-sizes
     const unsigned int minBin_;

diff --git a/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h b/CLUEstering/alpaka/AlpakaCore/HostOnlyTask.h
@@ -26,16 +26,22 @@ namespace alpaka {
     struct Enqueue<QueueCudaRtNonBlocking, HostOnlyTask> {
       using TApi = ApiCudaRt;
 
-      static void CUDART_CB callback(cudaStream_t /*queue*/, cudaError_t /*status*/, void* arg) {
+      static void CUDART_CB callback(cudaStream_t /*queue*/,
+                                     cudaError_t /*status*/,
+                                     void* arg) {
         //ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(status);
         std::unique_ptr<HostOnlyTask> pTask(static_cast<HostOnlyTask*>(arg));
         (*pTask)();
       }
 
-      ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task) -> void {
+      ALPAKA_FN_HOST static auto enqueue(QueueCudaRtNonBlocking& queue, HostOnlyTask task)
+          -> void {
         auto pTask = std::make_unique<HostOnlyTask>(std::move(task));
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(cudaStreamAddCallback(
-            alpaka::getNativeHandle(queue), callback, static_cast<void*>(pTask.release()), 0u));
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            cudaStreamAddCallback(alpaka::getNativeHandle(queue),
+                                  callback,
+                                  static_cast<void*>(pTask.release()),
+                                  0u));
       }
     };
 #endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
@@ -52,10 +58,14 @@ namespace alpaka {
         (*pTask)();
       }
 
-      ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task) -> void {
+      ALPAKA_FN_HOST static auto enqueue(QueueHipRtNonBlocking& queue, HostOnlyTask task)
+          -> void {
         auto pTask = std::make_unique<HostOnlyTask>(std::move(task));
-        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(hipStreamAddCallback(
-            alpaka::getNativeHandle(queue), callback, static_cast<void*>(pTask.release()), 0u));
+        ALPAKA_UNIFORM_CUDA_HIP_RT_CHECK(
+            hipStreamAddCallback(alpaka::getNativeHandle(queue),
+                                 callback,
+                                 static_cast<void*>(pTask.release()),
+                                 0u));
       }
     };
 #endif  // ALPAKA_ACC_GPU_HIP_ENABLED

diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h b/CLUEstering/alpaka/AlpakaCore/alpakaConfig.h
@@ -36,7 +36,8 @@ namespace alpaka_common {
 
 // trick to force expanding ALPAKA_ACCELERATOR_NAMESPACE before stringification inside DEFINE_FWK_MODULE
 #define DEFINE_FWK_ALPAKA_MODULE2(name) DEFINE_FWK_MODULE(name)
-#define DEFINE_FWK_ALPAKA_MODULE(name) DEFINE_FWK_ALPAKA_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name)
+#define DEFINE_FWK_ALPAKA_MODULE(name) \
+  DEFINE_FWK_ALPAKA_MODULE2(ALPAKA_ACCELERATOR_NAMESPACE::name)
 
 #define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE2(name) DEFINE_FWK_EVENTSETUP_MODULE(name)
 #define DEFINE_FWK_ALPAKA_EVENTSETUP_MODULE(name) \

diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h b/CLUEstering/alpaka/AlpakaCore/alpakaDevices.h
@@ -12,7 +12,8 @@
 namespace cms::alpakatools {
 
   // alpaka host device
-  inline const alpaka_common::DevHost host = alpaka::getDevByIdx<alpaka_common::PltfHost>(0u);
+  inline const alpaka_common::DevHost host =
+      alpaka::getDevByIdx<alpaka_common::PltfHost>(0u);
 
   // alpaka accelerator devices
   template <typename TPlatform>

diff --git a/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h b/CLUEstering/alpaka/AlpakaCore/alpakaFwd.h
@@ -57,10 +57,14 @@ namespace alpaka {
     template <typename TApi, bool TBlocking>
     class QueueUniformCudaHipRt;
   }
-  using QueueCudaRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, true>;
-  using QueueCudaRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, false>;
-  using QueueHipRtBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, true>;
-  using QueueHipRtNonBlocking = uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, false>;
+  using QueueCudaRtBlocking =
+      uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, true>;
+  using QueueCudaRtNonBlocking =
+      uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiCudaRt, false>;
+  using QueueHipRtBlocking =
+      uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, true>;
+  using QueueHipRtNonBlocking =
+      uniform_cuda_hip::detail::QueueUniformCudaHipRt<ApiHipRt, false>;
 
   // Events
   template <typename TDev>