Merge pull request #449 from beomki-yeo/refactor-clusterization2

(Experimental) Refactor Clusterization
acts-project · Oct 10, 2023 · af6dcca · af6dcca
2 parents d8f0bfd + 87b7dac
commit af6dcca
Show file tree

Hide file tree

Showing 23 changed files with 663 additions and 31 deletions.
diff --git a/core/include/traccc/clusterization/detail/measurement_creation_helper.hpp b/core/include/traccc/clusterization/detail/measurement_creation_helper.hpp
@@ -79,7 +79,7 @@ TRACCC_HOST inline void calc_cluster_properties(
 /// @param[in] module is the cell module where the cluster belongs to
 /// @param[in] module_link is the module index
 ///
-TRACCC_HOST void fill_measurement(
+TRACCC_HOST inline void fill_measurement(
     measurement_collection_types::host& measurements,
     const cell_collection_types::host& cluster, const cell_module& module,
     const unsigned int module_link) {
@@ -102,7 +102,7 @@ TRACCC_HOST void fill_measurement(
     if (totalWeight > 0.) {
         measurement m;
         m.module_link = module_link;
-        m.surface_link = detray::geometry::barcode{module.module};
+        m.surface_link = module.surface_link;
         // normalize the cell position
         m.local = mean;
         // normalize the variance

diff --git a/core/include/traccc/edm/cell.hpp b/core/include/traccc/edm/cell.hpp
@@ -13,6 +13,9 @@
 #include "traccc/edm/container.hpp"
 #include "traccc/geometry/pixel_data.hpp"
 
+// Detray include(s).
+#include "detray/geometry/barcode.hpp"
+
 namespace traccc {
 
 /// Definition of a detector module
@@ -23,7 +26,7 @@ namespace traccc {
 ///
 struct cell_module {
 
-    geometry_id module = 0;
+    detray::geometry::barcode surface_link{0u};
     transform3 placement = transform3{};
     scalar threshold = 0;
 
@@ -37,7 +40,7 @@ using cell_module_collection_types = collection_types<cell_module>;
 /// Equality operator for cell module
 TRACCC_HOST_DEVICE
 inline bool operator==(const cell_module& lhs, const cell_module& rhs) {
-    return lhs.module == rhs.module;
+    return lhs.surface_link == rhs.surface_link;
 }
 
 /// Definition for one detector cell

diff --git a/device/common/include/traccc/clusterization/device/impl/aggregate_cluster.ipp b/device/common/include/traccc/clusterization/device/impl/aggregate_cluster.ipp
@@ -106,8 +106,8 @@ inline void aggregate_cluster(
      */
     out.local = mean;
     out.variance = var;
+    out.surface_link = this_module.surface_link;
     out.module_link = module_link;
-    out.surface_link = detray::geometry::barcode{this_module.module};
     // The following will need to be filled properly "soon".
     out.meas_dim = 2u;
 }

diff --git a/device/common/include/traccc/clusterization/device/impl/reduce_problem_cell.ipp b/device/common/include/traccc/clusterization/device/impl/reduce_problem_cell.ipp
@@ -14,8 +14,8 @@ namespace traccc::device {
  * cluster.
  */
 TRACCC_HOST_DEVICE
-bool is_adjacent(channel_id ac0, channel_id ac1, channel_id bc0,
-                 channel_id bc1) {
+inline bool is_adjacent(channel_id ac0, channel_id ac1, channel_id bc0,
+                        channel_id bc1) {
     unsigned int p0 = (ac0 - bc0);
     unsigned int p1 = (ac1 - bc1);
 

diff --git a/device/cuda/CMakeLists.txt b/device/cuda/CMakeLists.txt
@@ -42,6 +42,8 @@ traccc_add_library( traccc_cuda cuda TYPE SHARED
   "src/seeding/seeding_algorithm.cpp"
   "src/cca/component_connection.cu"
   # Clusterization
+  "include/traccc/cuda/clusterization/experimental/clusterization_algorithm.hpp"
+  "src/clusterization/experimental/clusterization_algorithm.cu"
   "include/traccc/cuda/clusterization/clusterization_algorithm.hpp"
   "src/clusterization/clusterization_algorithm.cu"
   # Finding

diff --git a/device/cuda/include/traccc/cuda/clusterization/experimental/clusterization_algorithm.hpp b/device/cuda/include/traccc/cuda/clusterization/experimental/clusterization_algorithm.hpp
@@ -0,0 +1,73 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2023 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// Local include(s).
+#include "traccc/cuda/utils/stream.hpp"
+
+// Project include(s).
+#include "traccc/edm/cell.hpp"
+#include "traccc/edm/measurement.hpp"
+#include "traccc/utils/algorithm.hpp"
+#include "traccc/utils/memory_resource.hpp"
+
+// VecMem include(s).
+#include <vecmem/utils/copy.hpp>
+
+namespace traccc::cuda::experimental {
+
+/// Algorithm performing hit clusterization
+///
+/// This algorithm implements hit clusterization in a massively-parallel
+/// approach. Each thread handles a pre-determined number of detector cells.
+///
+/// The buffer was documented as possibly unfilled on return. NOTE(review):
+/// the .cu implementation synchronises the stream before returning; confirm.
+///
+class clusterization_algorithm
+    : public algorithm<measurement_collection_types::buffer(
+          const cell_collection_types::const_view&,
+          const cell_module_collection_types::const_view&)> {
+
+    public:
+    /// Constructor for clusterization algorithm
+    ///
+    /// @param mr The memory resource(s) to use in the algorithm
+    /// @param copy The copy object to use for copying data between device
+    ///             and host memory blocks
+    /// @param str The CUDA stream to perform the operations in
+    /// @param target_cells_per_partition the average number of cells in each
+    /// partition
+    ///
+    clusterization_algorithm(const traccc::memory_resource& mr,
+                             vecmem::copy& copy, stream& str,
+                             const unsigned short target_cells_per_partition);
+    // NOTE(review): the value is not validated; zero divides by zero later.
+
+    /// Callable operator for clusterization algorithm
+    ///
+    /// @param cells        a collection of cells
+    /// @param modules      a collection of modules
+    /// @return a measurement collection (buffer) created from the input
+    /// cells
+    output_type operator()(
+        const cell_collection_types::const_view& cells,
+        const cell_module_collection_types::const_view& modules) const override;
+
+    private:
+    /// The average number of cells in each partition
+    unsigned short m_target_cells_per_partition;
+    /// The memory resource(s) to use
+    traccc::memory_resource m_mr;
+    /// The copy object to use
+    vecmem::copy& m_copy;
+    /// The CUDA stream to use
+    stream& m_stream;
+};
+
+}  // namespace traccc::cuda::experimental
diff --git a/device/cuda/src/clusterization/experimental/clusterization_algorithm.cu b/device/cuda/src/clusterization/experimental/clusterization_algorithm.cu
@@ -0,0 +1,147 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+// CUDA Library include(s).
+#include "../../utils/utils.hpp"
+#include "traccc/cuda/clusterization/experimental/clusterization_algorithm.hpp"
+#include "traccc/cuda/utils/barrier.hpp"
+#include "traccc/cuda/utils/definitions.hpp"
+
+// Project include(s)
+#include "traccc/clusterization/device/aggregate_cluster.hpp"
+#include "traccc/clusterization/device/ccl_kernel.hpp"
+#include "traccc/clusterization/device/reduce_problem_cell.hpp"
+
+// Vecmem include(s).
+#include <vecmem/utils/copy.hpp>
+
+namespace traccc::cuda::experimental {
+
+namespace {
+/// Index type for cells inside one partition: values only range from 0 to
+/// max_cells_per_partition, so an unsigned short is sufficient.
+using index_t = unsigned short;
+
+static constexpr int TARGET_CELLS_PER_THREAD = 8;  // sizes the thread block
+static constexpr int MAX_CELLS_PER_THREAD = 12;    // sizes the partition cap
+}  // namespace
+
+namespace kernels {
+
+/// CUDA kernel forwarding thread/block indices to @c device::ccl_kernel
+__global__ void ccl_kernel(
+    const cell_collection_types::const_view cells_view,
+    const cell_module_collection_types::const_view modules_view,
+    const index_t max_cells_per_partition,
+    const index_t target_cells_per_partition,
+    measurement_collection_types::view measurements_view,
+    unsigned int& measurement_count,
+    vecmem::data::vector_view<unsigned int> cell_links) {
+    __shared__ unsigned int partition_start, partition_end;
+    __shared__ unsigned int outi;
+    extern __shared__ index_t shared_v[];  // size is set at kernel launch
+    index_t* f = &shared_v[0];             // first half of shared_v
+    index_t* f_next = &shared_v[max_cells_per_partition];  // second half
+    traccc::cuda::barrier barry_r;
+
+    device::ccl_kernel(threadIdx.x, blockDim.x, blockIdx.x, cells_view,
+                       modules_view, max_cells_per_partition,
+                       target_cells_per_partition, partition_start,
+                       partition_end, outi, f, f_next, barry_r,
+                       measurements_view, measurement_count, cell_links);
+}
+
+}  // namespace kernels
+
+// Stores the configuration and helper objects. The initialisers are listed in
+// member declaration order, which is the order they are actually run in.
+clusterization_algorithm::clusterization_algorithm(
+    const traccc::memory_resource& mr, vecmem::copy& copy, stream& str,
+    const unsigned short target_cells_per_partition)
+    : m_target_cells_per_partition(target_cells_per_partition),
+      m_mr(mr),
+      m_copy(copy),
+      m_stream(str) {}
+
+clusterization_algorithm::output_type clusterization_algorithm::operator()(
+    const cell_collection_types::const_view& cells,
+    const cell_module_collection_types::const_view& modules) const {
+
+    // Get the raw CUDA stream that all asynchronous calls below are issued on.
+    cudaStream_t stream = details::get_stream(m_stream);
+
+    // Number of cells
+    const cell_collection_types::view::size_type num_cells =
+        m_copy.get_size(cells);
+
+    if (num_cells == 0) {
+        return {0, m_mr.main};  // no cells: return a zero-sized buffer
+    }
+
+    // Create result object for the CCL kernel, sized for num_cells entries
+    measurement_collection_types::buffer measurements_buffer(num_cells,
+                                                             m_mr.main);
+    m_copy.setup(measurements_buffer);
+
+    // Device-side counter for the number of measurements, zeroed up front
+    vecmem::unique_alloc_ptr<unsigned int> num_measurements_device =
+        vecmem::make_unique_alloc<unsigned int>(m_mr.main);
+    CUDA_ERROR_CHECK(cudaMemsetAsync(num_measurements_device.get(), 0,
+                                     sizeof(unsigned int), stream));
+
+    const unsigned short max_cells_per_partition =
+        (m_target_cells_per_partition * MAX_CELLS_PER_THREAD +
+         TARGET_CELLS_PER_THREAD - 1) /
+        TARGET_CELLS_PER_THREAD;  // ceil(target * MAX / TARGET)
+    const unsigned int threads_per_partition =
+        (m_target_cells_per_partition + TARGET_CELLS_PER_THREAD - 1) /
+        TARGET_CELLS_PER_THREAD;  // ceil(target / TARGET)
+    const unsigned int num_partitions =
+        (num_cells + m_target_cells_per_partition - 1) /
+        m_target_cells_per_partition;  // ceil(num_cells / target)
+
+    // Create the per-cell link buffer that is handed to the CCL kernel.
+    vecmem::data::vector_buffer<unsigned int> cell_links(num_cells, m_mr.main);
+    m_copy.setup(cell_links);
+
+    // Launch ccl kernel with one block per partition.
+    kernels::
+        ccl_kernel<<<num_partitions, threads_per_partition,
+                     2 * max_cells_per_partition * sizeof(index_t), stream>>>(
+            cells, modules, max_cells_per_partition,
+            m_target_cells_per_partition, measurements_buffer,
+            *num_measurements_device, cell_links);
+
+    CUDA_ERROR_CHECK(cudaGetLastError());
+
+    // Copy number of measurements to host (host resource if one is set)
+    vecmem::unique_alloc_ptr<unsigned int> num_measurements_host =
+        vecmem::make_unique_alloc<unsigned int>(
+            (m_mr.host != nullptr) ? *(m_mr.host) : m_mr.main);
+    CUDA_ERROR_CHECK(cudaMemcpyAsync(
+        num_measurements_host.get(), num_measurements_device.get(),
+        sizeof(unsigned int), cudaMemcpyDeviceToHost, stream));
+    m_stream.synchronize();  // the host-side count is read just below
+
+    // Create a new measurement buffer sized to the actual measurement count
+    measurement_collection_types::buffer new_measurements_buffer(
+        *num_measurements_host, m_mr.main);
+    m_copy.setup(new_measurements_buffer);
+
+    vecmem::device_vector<measurement> measurements_device(measurements_buffer);
+    vecmem::device_vector<measurement> new_measurements_device(
+        new_measurements_buffer);
+
+    CUDA_ERROR_CHECK(cudaMemcpyAsync(
+        new_measurements_device.begin(), measurements_device.begin(),
+        sizeof(measurement) * (*num_measurements_host),
+        cudaMemcpyDeviceToDevice, stream));
+
+    m_stream.synchronize();  // wait for the device-to-device copy
+
+    return new_measurements_buffer;
+}
+
+}  // namespace traccc::cuda::experimental
diff --git a/device/sycl/CMakeLists.txt b/device/sycl/CMakeLists.txt
@@ -15,6 +15,7 @@ enable_language( SYCL )
 traccc_add_library( traccc_sycl sycl TYPE SHARED
   # header files
   "include/traccc/sycl/clusterization/clusterization_algorithm.hpp"
+  "include/traccc/sycl/clusterization/experimental/clusterization_algorithm.hpp"
   "include/traccc/sycl/fitting/fitting_algorithm.hpp"
   "include/traccc/sycl/seeding/experimental/spacepoint_formation.hpp"
   "include/traccc/sycl/seeding/seeding_algorithm.hpp"
@@ -27,6 +28,7 @@ traccc_add_library( traccc_sycl sycl TYPE SHARED
   "include/traccc/sycl/utils/barrier.hpp"
   # implementation files
   "src/clusterization/clusterization_algorithm.sycl"
+  "src/clusterization/experimental/clusterization_algorithm.sycl"
   "src/fitting/fitting_algorithm.sycl"
   "src/seeding/experimental/spacepoint_formation.sycl"
   "src/seeding/seed_finding.sycl"

diff --git a/device/sycl/include/traccc/sycl/clusterization/experimental/clusterization_algorithm.hpp b/device/sycl/include/traccc/sycl/clusterization/experimental/clusterization_algorithm.hpp
@@ -0,0 +1,76 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2023 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+// SYCL library include(s).
+#include "traccc/sycl/utils/queue_wrapper.hpp"
+
+// Project include(s).
+#include "traccc/edm/cell.hpp"
+#include "traccc/edm/measurement.hpp"
+#include "traccc/utils/algorithm.hpp"
+#include "traccc/utils/memory_resource.hpp"
+
+// VecMem include(s).
+#include <vecmem/utils/copy.hpp>
+
+namespace traccc::sycl::experimental {
+
+/// Algorithm performing hit clusterization
+///
+/// This algorithm implements hit clusterization in a massively-parallel
+/// approach. Each thread handles a pre-determined number of detector cells.
+///
+/// This algorithm returns a buffer which is not necessarily filled yet. A
+/// synchronisation statement is required before destroying this buffer.
+///
+class clusterization_algorithm
+    : public algorithm<measurement_collection_types::buffer(
+          const cell_collection_types::const_view&,
+          const cell_module_collection_types::const_view&)> {
+
+    public:
+    /// Constructor for clusterization algorithm
+    ///
+    /// @param mr The memory resource(s) to use in the algorithm
+    /// @param copy The copy object to use for copying data between device
+    ///             and host memory blocks
+    /// @param queue The SYCL queue to perform the operations in
+    /// @param target_cells_per_partition the average number of cells in each
+    /// partition
+    ///
+    clusterization_algorithm(const traccc::memory_resource& mr,
+                             vecmem::copy& copy, queue_wrapper queue,
+                             const unsigned short target_cells_per_partition);
+    // const unsigned short target_cells_per_partition);
+
+    /// Callable operator for clusterization algorithm
+    ///
+    /// @param cells        a collection of cells
+    /// @param modules      a collection of modules
+    /// @return a measurement collection (buffer) created from the input
+    /// cells
+    output_type operator()(
+        const cell_collection_types::const_view& cells,
+        const cell_module_collection_types::const_view& modules) const override;
+
+    private:
+    /// The average number of cells in each partition
+    unsigned short m_target_cells_per_partition;
+    /// The maximum number of threads in a work group
+    unsigned int m_max_work_group_size;
+
+    /// The memory resource(s) to use
+    traccc::memory_resource m_mr;
+    /// The copy object to use
+    vecmem::copy& m_copy;
+    /// The SYCL queue object
+    mutable queue_wrapper m_queue;
+};
+
+}  // namespace traccc::sycl::experimental