From b9218a2779ab47e539ab315d4effe680fe499b9c Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Tue, 20 Sep 2022 10:17:23 +0200 Subject: [PATCH] New seeding --- device/cuda/CMakeLists.txt | 5 + .../cuda/seeding2/kernels/kd_tree_kernel.hpp | 24 + .../seeding2/kernels/seed_finding_kernel.hpp | 38 + .../seeding2/kernels/write_output_kernel.hpp | 25 + .../traccc/cuda/seeding2/seed_finding.hpp | 35 + .../cuda/seeding2/types/internal_seed.hpp | 29 + .../traccc/cuda/seeding2/types/kd_tree.hpp | 54 ++ .../traccc/cuda/seeding2/types/range3d.hpp | 58 ++ .../src/seeding2/kernels/kd_tree_kernel.cu | 807 ++++++++++++++++++ .../seeding2/kernels/seed_finding_kernel.cu | 715 ++++++++++++++++ .../seeding2/kernels/write_output_kernel.cu | 108 +++ device/cuda/src/seeding2/seed_finding.cu | 108 +++ 12 files changed, 2006 insertions(+) create mode 100644 device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp create mode 100644 device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp create mode 100644 device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp create mode 100644 device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp create mode 100644 device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp create mode 100644 device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp create mode 100644 device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp create mode 100644 device/cuda/src/seeding2/kernels/kd_tree_kernel.cu create mode 100644 device/cuda/src/seeding2/kernels/seed_finding_kernel.cu create mode 100644 device/cuda/src/seeding2/kernels/write_output_kernel.cu create mode 100644 device/cuda/src/seeding2/seed_finding.cu diff --git a/device/cuda/CMakeLists.txt b/device/cuda/CMakeLists.txt index 74afa34f51..6089b21e27 100644 --- a/device/cuda/CMakeLists.txt +++ b/device/cuda/CMakeLists.txt @@ -30,6 +30,11 @@ traccc_add_library( traccc_cuda cuda TYPE SHARED # Clusterization "include/traccc/cuda/clusterization/clusterization_algorithm.hpp" "src/clusterization/clusterization_algorithm.cu" + # Alternate seeding. + "src/seeding2/seed_finding.cu" + "src/seeding2/kernels/seed_finding_kernel.cu" + "src/seeding2/kernels/kd_tree_kernel.cu" + "src/seeding2/kernels/write_output_kernel.cu" ) target_compile_options( traccc_cuda PRIVATE $<$:--expt-relaxed-constexpr> ) diff --git a/device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp b/device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp new file mode 100644 index 0000000000..ed8677ff58 --- /dev/null +++ b/device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp @@ -0,0 +1,24 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include +#include +#include + +namespace traccc::cuda { +/** + * @brief Creates a k-d tree from a given set of spacepoints. + * + * @return A pair containing the k-d tree nodes as well as the number of nodes. + */ +std::pair, uint32_t> create_kd_tree( + vecmem::memory_resource&, const internal_spacepoint* const, + uint32_t); +} // namespace traccc::cuda diff --git a/device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp b/device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp new file mode 100644 index 0000000000..99c795504f --- /dev/null +++ b/device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp @@ -0,0 +1,38 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace traccc::cuda { +/** + * @brief Convenience data structure for all the data we need for seeding. + */ +struct seed_finding_data_t { + const seedfinder_config finder_conf; + const seedfilter_config filter_conf; + const internal_spacepoint* spacepoints; + const kd_tree_t* tree; + const uint32_t tree_nodes; +}; + +/** + * @brief Execute the seed finding kernel itself. + * + * @return A pair containing the list of internal seeds as well as the number + * of seeds. + */ +std::pair, uint32_t> run_seeding( + seedfinder_config, seedfilter_config, vecmem::memory_resource&, + const internal_spacepoint* const, uint32_t, + vecmem::unique_alloc_ptr&&, uint32_t); +} // namespace traccc::cuda diff --git a/device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp b/device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp new file mode 100644 index 0000000000..4e28edcb54 --- /dev/null +++ b/device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp @@ -0,0 +1,25 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace traccc::cuda { +/** + * @brief Kernel to write output data back into traccc's EDM. + * + * @return A vector buffer containing the output seeds. + */ +vecmem::data::vector_buffer write_output( + const traccc::memory_resource &, uint32_t, + const internal_spacepoint *const, const internal_seed *const); +} // namespace traccc::cuda diff --git a/device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp b/device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp new file mode 100644 index 0000000000..17bc9e3110 --- /dev/null +++ b/device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp @@ -0,0 +1,35 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace traccc::cuda { +/** + * @brief Alternative seed finding algorithm, using orthogonal range search + * implemented through a k-d tree. + */ +class seed_finding2 : public algorithm( + const spacepoint_container_types::const_view&)> { + public: + seed_finding2(const traccc::memory_resource& mr); + + vecmem::data::vector_buffer operator()( + const spacepoint_container_types::const_view& sps) const override; + + private: + traccc::memory_resource m_output_mr; + seedfinder_config m_finder_conf; + seedfilter_config m_filter_conf; +}; +} // namespace traccc::cuda diff --git a/device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp b/device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp new file mode 100644 index 0000000000..e26919a39b --- /dev/null +++ b/device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp @@ -0,0 +1,29 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include + +namespace traccc::cuda { +/** + * @brief Internal structure representing a single seed. + */ +struct internal_seed { + std::array spacepoints; + float weight; + + __host__ __device__ static internal_seed Invalid() { + internal_seed r; + + r.weight = std::numeric_limits::lowest(); + + return r; + } +}; +} // namespace traccc::cuda diff --git a/device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp b/device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp new file mode 100644 index 0000000000..7867d87b6d --- /dev/null +++ b/device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp @@ -0,0 +1,54 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include + +namespace traccc::cuda { +/** + * @brief Data structure representing a node in a k-d tree. + * + * @tparam _value_t Type of values in the tree. + * @tparam _nodes_i Number of values in each leaf node. + */ +template +struct kd_tree_node { + struct element_wrapper_t { + _value_t value; + float phi, r, z; + }; + + enum class nodetype_e { LEAF, INTERNAL, NON_EXTANT, INCOMPLETE }; + + enum class pivot_e { Phi, R, Z }; + + static constexpr std::size_t MAX_NODES_PER_LEAF = _nodes_i; + + nodetype_e type; + + range3d range; + + union { + struct { + element_wrapper_t points[MAX_NODES_PER_LEAF]; + uint32_t point_count; + } leaf; + struct { + pivot_e dim; + float mid; + } internal; + struct { + } non_extant; + struct { + uint32_t begin, end; + } incomplete; + }; +}; + +using kd_tree_t = kd_tree_node; +} // namespace traccc::cuda diff --git a/device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp b/device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp new file mode 100644 index 0000000000..e4bb6172a1 --- /dev/null +++ b/device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp @@ -0,0 +1,58 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include +#include + +namespace traccc::cuda { +/** + * @brief Range in three dimensions, defined by minima and maxima in the phi, + * r, and z dimensions. + */ +struct range3d { + float phi_min, phi_max, r_min, r_max, z_min, z_max; + + __host__ __device__ static range3d Infinite() { + range3d r; + + r.phi_min = std::numeric_limits::lowest(); + r.phi_max = std::numeric_limits::max(); + r.r_min = 0.f; + r.r_max = std::numeric_limits::max(); + r.z_min = std::numeric_limits::lowest(); + r.z_max = std::numeric_limits::max(); + + return r; + } + + __host__ __device__ static range3d Degenerate() { + range3d r; + + r.phi_min = std::numeric_limits::max(); + r.phi_max = std::numeric_limits::lowest(); + r.r_min = std::numeric_limits::max(); + r.r_max = 0.f; + r.z_min = std::numeric_limits::max(); + r.z_max = std::numeric_limits::lowest(); + + return r; + } + + __host__ __device__ bool intersects(const range3d& o) const { + return phi_min <= o.phi_max && o.phi_min < phi_max && + r_min <= o.r_max && o.r_min < r_max && z_min <= o.z_max && + o.z_min < z_max; + } + + __host__ __device__ bool contains(float phi, float r, float z) const { + return phi_min <= phi && phi <= phi_max && r_min <= r && r <= r_max && + z_min <= z && z <= z_max; + } +}; +} // namespace traccc::cuda diff --git a/device/cuda/src/seeding2/kernels/kd_tree_kernel.cu b/device/cuda/src/seeding2/kernels/kd_tree_kernel.cu new file mode 100644 index 0000000000..344d0419cc --- /dev/null +++ b/device/cuda/src/seeding2/kernels/kd_tree_kernel.cu @@ -0,0 +1,807 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace traccc::cuda { +/** + * @brief Bit mask for halo point indices. + * + * Our phi-axis search range wraps around, and k-d trees do not natively + * support this. Thankfully, we can solve this problem by simply duplicating + * points at the edges of the phi range. These are called phi halo points. We + * mark these points by setting the MSB in their index. This means that, in + * principle, the index is invalid, but it also saves quite a lot of space. + * This mask contains the exact bit that is set as the mask. + */ +constexpr uint32_t AUXILIARY_INDEX_MASK = (1u << 31); + +/** + * @brief Width of phi-halo range. + * + * This value defines the range in which points are copied and treated as halo + * points. If this width is ε, then points below -π + ε and above π - ε are + * halo points. This value should be at least as large as the maximum phi delta + * betweel spacepoints. + */ +constexpr float PHI_HALO_WIDTH = 0.06f; + +/** + * @brief Get coordinate value for a halo-aware index in a given list of + * spacepoints. + * + * For non-halo points, this function simply returns the phi value of the + * point. But for halo points, it calculates the proper adjusted phi value. For + * points in the range -π ≤ φ ≤ -π + ε, this is φ + 2π. For points in the range + * π - ε ≤ φ ≤ π it is φ - 2π. + * + * @param[in] spacepoints The array of spacepoints. + * @param[in] i The halo-aware index in the spacepoint array. + * + * @return The ϕ value of the spacepoint with the given index. + */ +__device__ float get_coordinate( + const internal_spacepoint* const __restrict__ spacepoints, + uint32_t i, kd_tree_t::pivot_e dim) { + bool is_real = !(i & AUXILIARY_INDEX_MASK); + + if (is_real) { + if (dim == kd_tree_t::pivot_e::Phi) { + return spacepoints[i].phi(); + } else if (dim == kd_tree_t::pivot_e::R) { + return spacepoints[i].radius(); + } else { + return spacepoints[i].z(); + } + } else { + uint32_t real_i = i & ~AUXILIARY_INDEX_MASK; + + if (dim == kd_tree_t::pivot_e::Phi) { + float phi = spacepoints[real_i].phi(); + + if (phi >= 0) { + return phi - 2 * M_PI; + } else { + return phi + 2 * M_PI; + } + } else if (dim == kd_tree_t::pivot_e::R) { + return spacepoints[real_i].radius(); + } else { + return spacepoints[real_i].z(); + } + } +} + +/** + * @brief Initialize the index vector used to build the k-d tree. + * + * Our k-d tree construction algorithm keeps track of spacepoints using a big + * index array, which is initialized here. Given n spacepoints, this has n + m + * elements, where m is the number of halo points. + * + * @param[in] spacepoints The list of spacepoints. + * @param[out] indices The output array to write to. + * @param[in] sps The total number of spacepoints. + * @param[out] extra_indices The output halo point count. + */ +__global__ void initialize_index_vector( + const internal_spacepoint* const __restrict__ spacepoints, + uint32_t* __restrict__ indices, uint32_t sps, + uint32_t* __restrict__ extra_indices) { + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + + if (grid.thread_rank() < sps) { + indices[grid.thread_rank()] = grid.thread_rank(); + + /* + * If we are a halo point, add an extra index for our evil twin! + */ + if (spacepoints[grid.thread_rank()].phi() < (-M_PI + PHI_HALO_WIDTH) || + spacepoints[grid.thread_rank()].phi() > (M_PI - PHI_HALO_WIDTH)) { + indices[sps + atomicAdd(extra_indices, 1u)] = + grid.thread_rank() | AUXILIARY_INDEX_MASK; + } + } +} + +/** + * @brief Kernel that initializes the k-d tree. + * + * This kernel simply sets every node in a given k-d tree to be non-extant + * except for the root (at index 0) which is considered an incomplete node. + * + * @param[out] tree The array of tree nodes to write to. + * @param[in] num_nodes The total number of tree nodes. + * @param[in] num_indices The total number of indices. + */ +__global__ void initialize_kd_tree(kd_tree_t* __restrict__ tree, + uint32_t num_nodes, uint32_t num_indices) { + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + + if (grid.thread_rank() == 0) { + /* + * The root node starts as an incomplete node covering all indices in + * the index array. Its range is initialized as an infinite range. + */ + tree[0].type = kd_tree_t::nodetype_e::INCOMPLETE; + tree[0].incomplete.begin = 0; + tree[0].incomplete.end = num_indices; + tree[0].range = range3d::Infinite(); + } else if (grid.thread_rank() < num_nodes) { + /* + * All other nodes are non-extant. + */ + tree[grid.thread_rank()].type = kd_tree_t::nodetype_e::NON_EXTANT; + } +} + +/** + * @brief Helper function to determine the next pivot axis. + * + * k-d trees work by pivoting along a certain axis. In this implementation, we + * simply cycle through dimensions because that's very fast. We go in the order + * ϕ to r to z, and then repeat. + * + * @param[in] p The old pivot axis. + * + * @return The new pivot axis. + */ +__device__ typename kd_tree_t::pivot_e next_pivot( + typename kd_tree_t::pivot_e p) { + if (p == kd_tree_t::pivot_e::Phi) { + return kd_tree_t::pivot_e::R; + } else if (p == kd_tree_t::pivot_e::R) { + return kd_tree_t::pivot_e::Z; + } else { + return kd_tree_t::pivot_e::Phi; + } +} + +/** + * @brief Approximate mean finding algorithm for large numbers of spacepoints. + * + * When we have a lot of spacepoints to find the median for, doing so exactly + * is too slow. Thus, we use a sampling approach; we select a subset of points + * and calculate the median of those points. + * + * @param[in] spacepoints The list of spacepoints. + * @param[inout] indices The list of indices for _this_ computation. + * @param[in] num_indices The number of indices for _this_ function call. + * @param[in] pivot The requested axis along which to split. + * + * @return The approximate median value of the given indices in the given + * pivot axis. + */ +__device__ float get_approximate_median( + const internal_spacepoint* const __restrict__ spacepoints, + uint32_t* __restrict__ indices, uint32_t num_indices, + typename kd_tree_t::pivot_e pivot) { + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + + /* + * First, we declare some shared memory which will hold our samples from + * which to compute the median. We assume a maximum of one item per thread. + */ + extern __shared__ float values[]; + + /* + * Assert that the number of indices is at least as big as the block size, + * otherwise this computation might not work as designed! + */ + assert(num_indices >= block.size()); + + /* + * Calculate the stride between samples in the spacepoint array. + */ + uint32_t delta = num_indices / block.size(); + + /* + * Calculate the index for the current thread. + */ + uint32_t index = indices[(block.thread_rank() * delta) % num_indices]; + + /* + * Calculate the value of the chosen spacepoint in the requested pivot + * dimension and store it in the shared memory. + */ + values[block.thread_rank()] = get_coordinate(spacepoints, index, pivot); + + bool sorted; + + block.sync(); + + /* + * Implementation of odd-even sort to make sure that the values in our + * shared memory are sorted. For small ranges as we have here, this is + * reasonably efficient. + */ + do { + sorted = true; + + /* + * Odd step. + */ + for (uint32_t j = 2 * block.thread_rank() + 1; j < block.size() - 1; + j += 2 * block.size()) { + float v1 = values[j]; + float v2 = values[j + 1]; + + if (v1 > v2) { + values[j + 1] = v1; + values[j] = v2; + sorted = false; + } + } + + block.sync(); + + /* + * Even step. + */ + for (uint32_t j = 2 * block.thread_rank(); j < block.size() - 1; + j += 2 * block.size()) { + float v1 = values[j]; + float v2 = values[j + 1]; + + if (v1 > v2) { + values[j + 1] = v1; + values[j] = v2; + sorted = false; + } + } + } while (__syncthreads_or(!sorted)); + + /* + * Now that our sorting is complete, we can simply return the middle value + * in the range to get the median. + */ + return values[(block.size() + 1) / 2]; +} + +/** + * @brief Exact mean finding algorithm for small ranges. + * + * When the array of indices is relatively small, we can afford to find the + * exact median. We do this by sorting the data and then returning the index + * (rather than the value) of the median. + * + * @warning This function has the side effect that it leaves the index array + * sorted! + * + * @param[in] spacepoints The array of spacepoints. + * @param[inout] indices The array of indices for this function call. + * @param[in] num_indices The number of indices for this function call. + * @param[in] pivot The requested pivot axis. + */ +__device__ uint32_t sort_and_get_median( + const internal_spacepoint* const __restrict__ spacepoints, + uint32_t* __restrict__ indices, uint32_t num_indices, + typename kd_tree_t::pivot_e pivot) { + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + + /* + * Simple perform an odd-even sort to sort the range of indices by the + * value of the corresponding SPs in the requested pivot axis. + */ + bool sorted; + + do { + sorted = true; + + /* + * Odd step. + */ + for (uint32_t j = 2 * block.thread_rank() + 1; j < num_indices - 1; + j += 2 * block.size()) { + float wJ = get_coordinate(spacepoints, indices[j], pivot); + float wM = get_coordinate(spacepoints, indices[j + 1], pivot); + + bool swap = wJ > wM; + uint32_t e = indices[j + 1]; + + if (swap) { + indices[j + 1] = indices[j]; + indices[j] = e; + sorted = false; + } + } + + block.sync(); + + /* + * Even step. + */ + for (uint32_t j = 2 * block.thread_rank(); j < num_indices - 1; + j += 2 * block.size()) { + float wJ = get_coordinate(spacepoints, indices[j], pivot); + float wM = get_coordinate(spacepoints, indices[j + 1], pivot); + + bool swap = wJ > wM; + uint32_t e = indices[j + 1]; + + if (swap) { + indices[j + 1] = indices[j]; + indices[j] = e; + sorted = false; + } + } + } while (__syncthreads_or(!sorted)); + + /* + * Remember, we are returning an index here, not a value! + */ + return (num_indices + 1) / 2; +} + +/** + * @brief Create a leaf node in the given k-d tree. + * + * @param[inout] tree The tree in which to create the leaf node. + * @param[in] spacepoints The global array of spacepoints. + * @param[in] indices The indices from which to create a leaf node. + * @param[in] nid The node index. + */ +__device__ bool create_leaf_node( + kd_tree_t* __restrict__ tree, + const internal_spacepoint* const __restrict__ spacepoints, + const uint32_t* const __restrict__ indices, int nid) { + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + /* + * Get the current node. + */ + kd_tree_t& node = tree[nid]; + + /* + * Set the node to a leaf, and set the number of points it contains. + */ + if (block.thread_rank() == 0) { + node.type = kd_tree_t::nodetype_e::LEAF; + node.leaf.point_count = (node.incomplete.end - node.incomplete.begin); + } + + block.sync(); + + /* + * Store the given points in this leaf, retrieving their exact positions + * from the spacepoint array. + */ + for (uint32_t i = block.thread_rank(); i < node.leaf.point_count; + i += block.size()) { + uint32_t ridx = indices[i] & ~AUXILIARY_INDEX_MASK; + + node.leaf.points[i].value = ridx; + node.leaf.points[i].phi = + get_coordinate(spacepoints, indices[i], kd_tree_t::pivot_e::Phi); + node.leaf.points[i].r = spacepoints[ridx].radius(); + node.leaf.points[i].z = spacepoints[ridx].z(); + } + + /* + * This function always succeeds. + */ + return true; +} + +/** + * @brief Create an internal node in the k-d tree. + * + * This function constructs an internal node in the k-d tree at a given + * position. + * + * @warning This function has the side-effect of re-ordering the index array. + * + * @param[inout] tree The k-d tree in which to build the node. + * @param[in] spacepoints The spacepoint array. + * @param[inout] indices The indices from which to build an internal node. + * @param[inout] index_buffer Temporary storage for partitioning the indices. + * @param[in] nid The node index. + * @param[in] pivot The requested pivot axis. + */ +__device__ bool create_internal_node( + kd_tree_t* __restrict__ tree, + const internal_spacepoint* const __restrict__ spacepoints, + uint32_t* __restrict__ indices, uint32_t* __restrict__ index_buffer, + int nid, typename kd_tree_t::pivot_e pivot) { + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + + /* + * Get the node and it's left and right children. + */ + kd_tree_t& node = tree[nid]; + kd_tree_t& lhc = tree[2 * nid + 1]; + kd_tree_t& rhc = tree[2 * nid + 2]; + + /* + * Start the construction of the two children, only if we are the leader + * thread. + */ + if (block.thread_rank() == 0) { + lhc.type = kd_tree_t::nodetype_e::INCOMPLETE; + rhc.type = kd_tree_t::nodetype_e::INCOMPLETE; + } + + /* + * Let's hope this doesn't fire, but if we have zero (or god forbid a + * negative number) nodes, we have a big problem. + */ + assert(node.incomplete.end > node.incomplete.begin); + + uint32_t num_indices = node.incomplete.end - node.incomplete.begin; + + block.sync(); + + float mid_point_value; + + /* + * We have two different branches here; for small nodes we take an exact + * approach, for big nodes we make approximations. + */ + if (num_indices > block.size()) { + /* + * These counters are used for partitioning, and count how many points + * we have that are greater than our pivot, as well as how many are + * less than the pivot. + */ + __shared__ uint32_t lower_idx, upper_idx; + + if (block.thread_rank() == 0) { + lower_idx = 0; + upper_idx = 0; + } + + /* + * Find the approximate median value of the given spacepoints in the + * selected dimension. + */ + float mid_point = + get_approximate_median(spacepoints, indices, num_indices, pivot); + + block.sync(); + + /* + * Now, we partition the spacepoints (or rather, their indices) into + * lower and upper points, counting the number of elements we have in + * each. + */ + for (uint32_t i = block.thread_rank(); i < num_indices; + i += block.size()) { + if (get_coordinate(spacepoints, indices[i], pivot) >= mid_point) { + index_buffer[num_indices - (1u + atomicAdd(&upper_idx, 1u))] = + indices[i]; + } else { + index_buffer[atomicAdd(&lower_idx, 1u)] = indices[i]; + } + } + + block.sync(); + + /* + * Next, move the spacepoints from the partitioned buffer back into the + * main index array. + */ + for (uint32_t i = block.thread_rank(); i < num_indices; + i += block.size()) { + indices[i] = index_buffer[i]; + } + + block.sync(); + + /* + * Finally, set the beginning and end of our children, as well as the + * splitting point and the current node. + */ + if (block.thread_rank() == 0) { + lhc.incomplete.begin = node.incomplete.begin; + lhc.incomplete.end = lhc.incomplete.begin + lower_idx; + + rhc.incomplete.begin = lhc.incomplete.end; + rhc.incomplete.end = node.incomplete.end; + + mid_point_value = mid_point; + } + + block.sync(); + } else { + /* + * For small ranges, we just get the exact median. We have a helper + * function for this which also sorts the indices as a side-effect, and + * so we do not actually need to do any partitioning. + */ + uint32_t mid_point = + sort_and_get_median(spacepoints, indices, num_indices, pivot); + + /* + * Correctly set the beginning and the of our children, as well as our + * own pivot value. + */ + if (block.thread_rank() == 0) { + lhc.incomplete.begin = node.incomplete.begin; + lhc.incomplete.end = lhc.incomplete.begin + mid_point; + + rhc.incomplete.begin = lhc.incomplete.end; + rhc.incomplete.end = node.incomplete.end; + + mid_point_value = + get_coordinate(spacepoints, indices[mid_point], pivot); + } + + block.sync(); + } + + /* + * Now that the partitioning is complete, we can have our leader thread + * put the dots on the i's and complete this internal node. + */ + if (block.thread_rank() == 0) { + node.type = kd_tree_t::nodetype_e::INTERNAL; + node.internal.dim = pivot; + node.internal.mid = mid_point_value; + + lhc.range = node.range; + rhc.range = node.range; + + /* + * We need to make sure that we update the ranges of our children! + */ + if (node.internal.dim == kd_tree_t::pivot_e::Phi) { + lhc.range.phi_max = mid_point_value; + rhc.range.phi_min = mid_point_value; + } else if (node.internal.dim == kd_tree_t::pivot_e::R) { + lhc.range.r_max = mid_point_value; + rhc.range.r_min = mid_point_value; + } else { + lhc.range.z_max = mid_point_value; + rhc.range.z_min = mid_point_value; + } + } + + /* + * This function also always succeeds. + */ + return true; +} + +/** + * @brief Main k-d tree construction kernel. + * + * A k-d tree is constructed by repeated application of this kernel. This + * kernel uses a one-node-per-block approach. This is not optimal for early + * iteration counts. + * + * @warning This kernel assumes that the tree has already been initialized + * through the preceding kernels. + * + * @param[inout] tree The tree array in which to build. + * @param[in] num_nodes The maximum size of the k-d tree. + * @param[in] spacepoints The spacepoints to encode in the tree. + * @param[inout] _indices The index array to use. + * @param[inout] _index_buffer Temporary buffer space for indices. + * @param[in] iteration The iteration count. + */ +__global__ void construct_kd_tree_step( + kd_tree_t* __restrict__ tree, uint32_t num_nodes, + const internal_spacepoint* const __restrict__ spacepoints, + uint32_t* __restrict__ _indices, uint32_t* __restrict__ _index_buffer, + uint32_t iteration) { + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + + /* + * Calculate the current node ID. Remember that we start with 1 node + * indexed at [0], then 2 nodes at [1, 2], then 4 nodes at [3, 4, 5, 6], + * etc. Thus, the subarray for zero-indexed iteration i starts at 2^i - 1. + */ + int nid = ((1 << iteration) - 1) + blockIdx.x; + + if (nid >= num_nodes) { + return; + } + + /* + * Get the current node using the node ID. + */ + kd_tree_t& node = tree[nid]; + + /* + * Useful assertion for debugging, to ensure that this kernel only ever + * encounters incomplete or non-extant nodes, as it should. + */ + [[maybe_unused]] bool is_correct_node_type = + node.type == kd_tree_t::nodetype_e::INCOMPLETE || + node.type == kd_tree_t::nodetype_e::NON_EXTANT; + + assert(is_correct_node_type); + + /* + * Note that this is not equivalent per se to the above, and that this is + * not an error. But it does indicate that there is no work to be done. + */ + if (node.type != kd_tree_t::nodetype_e::INCOMPLETE) { + return; + } + + bool build_leaf = node.incomplete.end - node.incomplete.begin <= + kd_tree_t::MAX_NODES_PER_LEAF; + + block.sync(); + + /* + * Check to see whether we could fit the range assigned to this block into + * a leaf node. If so, we construct one. Otherwise, we construct an + * internal node. + */ + if (build_leaf) { + [[maybe_unused]] bool r = create_leaf_node( + tree, spacepoints, &_indices[node.incomplete.begin], nid); + + assert(r); + } else { + [[maybe_unused]] bool r = false; + + typename kd_tree_t::pivot_e pivot; + + /* + * If we are the root node, our pivot axis is φ; otherwise, it's what- + * ever is next after our parent's pivot axis. + */ + if (nid == 0) { + pivot = kd_tree_t::pivot_e::Phi; + } else { + pivot = next_pivot(tree[(nid - 1) / 2].internal.dim); + } + + /* + * Try to create an internal node with the given pivot. + */ + r = create_internal_node( + tree, spacepoints, &_indices[node.incomplete.begin], + &_index_buffer[node.incomplete.begin], nid, pivot); + + /* + * Check whether the process succeeded. It should. + */ + assert(r); + } +} + +uint32_t round_to_power_2(uint32_t i) { + uint32_t power = 1u; + + while (power < i) { + power *= 2u; + } + + return power; +} + +std::pair, uint32_t> create_kd_tree( + vecmem::memory_resource& mr, + const internal_spacepoint* const spacepoints, uint32_t n_sp) { + /* + * Allocate space for the indices. Since each spacepoint can produce at + * most two indices, we allocate enough space to support this theoretical + * upper bound. + */ + vecmem::unique_alloc_ptr indices = + vecmem::make_unique_alloc(mr, 2 * n_sp); + + /* + * Allocate an on-device counter for the number of halo points we have, + * which starts out at zero. + */ + vecmem::unique_alloc_ptr extra_indices = + vecmem::make_unique_alloc(mr); + + CUDA_ERROR_CHECK(cudaMemset(extra_indices.get(), 0, sizeof(uint32_t))); + + /* + * Launch the index initialization kernel, which turns n spacepoints into + * anywhere between n and 2n indices in the index array. + */ + std::size_t threads_per_block1 = 256; + initialize_index_vector<<<(n_sp / threads_per_block1) + + (n_sp % threads_per_block1 == 0 ? 0 : 1), + threads_per_block1>>>(spacepoints, indices.get(), + n_sp, extra_indices.get()); + + CUDA_ERROR_CHECK(cudaGetLastError()); + CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + + /* + * Retrieve the number of halo points from the device. + */ + uint32_t extra_indices_h; + + CUDA_ERROR_CHECK(cudaMemcpy(&extra_indices_h, extra_indices.get(), + sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + /* + * The total number of indices is equal to the number of spacepoints plus + * the number of halo points. + */ + uint32_t num_indices = n_sp + extra_indices_h; + + /* + * In theory, the number of nodes that is needed to support N points given + * K points per node is 2^⌈log_2⌈N / ⌊K / 2⌋⌉ + 1⌉ - 1. In practice, we + * build potentially _very_ unbalanced trees. Thus, we need a bit more + * space. + */ + uint32_t largest_layer = round_to_power_2(num_indices) / 4; + uint32_t num_nodes = 4 * largest_layer - 1; + + /* + * Allocate space in which the k-d tree will live. + */ + vecmem::unique_alloc_ptr kd_tree_device = + vecmem::make_unique_alloc(mr, num_nodes); + + /* + * Run the initialization kernel for the tree itself, which sets the + * initial types of the nodes. + */ + std::size_t threads_per_block2 = 256; + initialize_kd_tree<<<(num_nodes / threads_per_block2) + + (num_nodes % threads_per_block2 == 0 ? 0 : 1), + threads_per_block2>>>(kd_tree_device.get(), num_nodes, + num_indices); + + CUDA_ERROR_CHECK(cudaGetLastError()); + CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + + /* + * Allocate some temporary space for the index buffer, one half for "lower" + * points, and one for "upper" points, defined relative to some pivot + * point. Multiple nodes may use this buffer to do their partitions at the + * same time. + */ + vecmem::unique_alloc_ptr index_buffer = + vecmem::make_unique_alloc(mr, num_indices); + + /* + * Repeatedly apply the k-d tree construction kernel. The iteration width + * is the number of nodes that is processed in parallel, and is simply 2^i + * where i is the iteration. We can stop running kernels after we cover the + * entire last level of the tree. + */ + for (uint32_t iteration = 0; (1u << iteration) <= largest_layer; + ++iteration) { + uint32_t width = 1u << iteration; + uint32_t items_per_node = largest_layer / width; + uint32_t threads_per_block = std::clamp(items_per_node, 32u, 256u); + + construct_kd_tree_step<<>>( + kd_tree_device.get(), num_nodes, spacepoints, indices.get(), + index_buffer.get(), iteration); + + CUDA_ERROR_CHECK(cudaGetLastError()); + CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + } + + return {std::move(kd_tree_device), num_nodes}; +} +} // namespace traccc::cuda diff --git a/device/cuda/src/seeding2/kernels/seed_finding_kernel.cu b/device/cuda/src/seeding2/kernels/seed_finding_kernel.cu new file mode 100644 index 0000000000..61a2ed0570 --- /dev/null +++ b/device/cuda/src/seeding2/kernels/seed_finding_kernel.cu @@ -0,0 +1,715 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define THREADS_PER_BLOCK 32u +#define MAX_LOWER_SP_PER_MIDDLE 200u +#define MAX_UPPER_SP_PER_MIDDLE 200u +#define KD_TREE_TRAVERSAL_STACK_SIZE 256u + +/** + * @brief Maximum difference in φ between two spacepoints adjacent in the same + * seed. + */ +static constexpr float MAX_DELTA_PHI = 0.06f; + +namespace traccc::cuda { +/** + * @brief Determine whether two spacepoints are a valid pair. + * + * @param[in] finder_conf The seed finder configuration to se. + * @param[in] lo The lower spacepoint. + * @param[in] hi The higher spacepoint. + * + * @return True iff the pair is valid. + */ +__device__ bool is_valid_pair(const seedfinder_config finder_conf, + const internal_spacepoint& lo, + const internal_spacepoint& hi) { + float deltaR = hi.radius() - lo.radius(); + + if (deltaR > finder_conf.deltaRMax) { + return false; + } + + if (deltaR < finder_conf.deltaRMin) { + return false; + } + + float cot_theta = (hi.z() - lo.z()) / deltaR; + + if (fabs(cot_theta) > finder_conf.cotThetaMax) { + return false; + } + + float z_origin = lo.z() - lo.radius() * cot_theta; + + if (z_origin < finder_conf.collisionRegionMin || + z_origin > finder_conf.collisionRegionMax) { + return false; + } + + return true; +} + +/** + * @brief Retrieve values from a k-d tree in a given range and write them to + * an output array. + * + * Performs depth-first traversal of the k-d tree, looking for candidate + * spacepoints. + * + * @param[in] input Data packet for the seeding algorithm. + * @param[in] range 3D range in which to search. + * @param[in] mi_idx Index of the middle spacepoint. + * @param[in] upper True iff we're looking for upper spacepoints. + * @param[out] output_arr Output array to write candidates to. + * @param[out] output_cnt Total number of output candidates. + */ +__device__ void retrieve_from_tree(const seed_finding_data_t& input, + const range3d range, uint32_t mi_idx, + bool upper, + uint32_t* __restrict__ output_arr, + uint32_t* __restrict__ output_cnt) { + __shared__ uint32_t stack[KD_TREE_TRAVERSAL_STACK_SIZE]; + __shared__ uint32_t stack_idx; + + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + + __shared__ uint32_t leaves[256], leaf_num; + + /* + * Retrieve the current spacepoint. + */ + const internal_spacepoint mi = input.spacepoints[mi_idx]; + + /* + * The leader thread initializes the DFS stack by pushing the root node to + * it. + */ + if (block.thread_rank() == 0) { + leaf_num = 0; + stack_idx = 0; + } + + block.sync(); + + uint32_t idx = block.size() - 1 + block.thread_rank(); + + assert(input.tree[idx].type == kd_tree_t::nodetype_e::INTERNAL); + + if (input.tree[idx].range.intersects(range)) { + stack[atomicAdd(&stack_idx, 1u)] = idx; + } + + block.sync(); + + /* + * The traversal runs until the stack is empty. Or until the entire kernel + * explodes and crashes! + */ + while (stack_idx > 0) { + /* + * We perform this traversal in parallel, so multiple threads pop a + * value from the stack at the same time. Of course, this can only + * happen if the thread index fits within the current size of the + * stack. + */ + if (block.thread_rank() < stack_idx) { + uint32_t current = stack[atomicSub(&stack_idx, 1u) - 1]; + + const kd_tree_t node = input.tree[current]; + + /* + * The node must be a leaf node or an internal node, otherwise we + * cannot (safely) traverse it. + */ + assert(node.type == kd_tree_t::nodetype_e::LEAF || + node.type == kd_tree_t::nodetype_e::INTERNAL); + + /* + * Choose the appropriate code path based on whether the node is a + * leaf or an internal node. + */ + if (node.type == kd_tree_t::nodetype_e::LEAF) { + leaves[atomicAdd(&leaf_num, 1u)] = current; + } else if (node.type == kd_tree_t::nodetype_e::INTERNAL && + node.range.intersects(range)) { + /* + * If we are an internal node and we intersect the search + * range, we place both children on the stack. Only the leader + * thread does this. + */ + uint32_t idx = atomicAdd(&stack_idx, 2u); + assert(idx + 1 < KD_TREE_TRAVERSAL_STACK_SIZE); + + stack[idx] = 2 * current + 1; + stack[idx + 1] = 2 * current + 2; + } + } + + /* + * Enforce synchronization of the block between DFS steps. + */ + block.sync(); + } + + for (uint32_t i = block.thread_rank(); + i < kd_tree_t::MAX_NODES_PER_LEAF * leaf_num; i += block.size()) { + uint32_t node_id = i / kd_tree_t::MAX_NODES_PER_LEAF; + uint32_t point_id = i % kd_tree_t::MAX_NODES_PER_LEAF; + + const kd_tree_t& node = input.tree[leaves[node_id]]; + + /* + * For leaf nodes, the tile performs a parallel process in + * which candidate points are stored in the k-d tree. + */ + const kd_tree_t::element_wrapper_t p = node.leaf.points[point_id]; + + if (range.contains(p.phi, p.r, p.z) && p.value != mi_idx) { + const internal_spacepoint o = + input.spacepoints[p.value]; + + /* + * Ensure that the point is actually valid using cuts which + * are more specific than those encoded in the search + * range. + */ + if (is_valid_pair(input.finder_conf, upper ? mi : o, + upper ? o : mi)) { + /* + * Reserve a spot in the output for this point, and + * then store it if it would not exceed the output + * array size. + */ + uint32_t idx = atomicAdd(output_cnt, 1u); + + assert((upper && idx < MAX_UPPER_SP_PER_MIDDLE) || + (!upper && idx < MAX_LOWER_SP_PER_MIDDLE)); + + output_arr[idx] = p.value; + } + } + } + + block.sync(); +} + +/** + * @brief Create an internal seed from three spacepoints. + * + * @param[in] input Data packet with seed finding data. + * @param[in] spacepoints The array of spacepoints. + * @param[in] lower The index of the lower spacepoint. + * @param[in] middle The index of the middle spacepoint. + * @param[in] upper The index of the upper spacepoint. + * + * @return An internal seed iff the triplet is valid, otherwise an invalid + * seed. + */ +__device__ internal_seed +make_seed(const seed_finding_data_t& input, + const internal_spacepoint* const spacepoints, + uint32_t lower, uint32_t middle, uint32_t upper) { + const internal_spacepoint lo = spacepoints[lower]; + const internal_spacepoint mi = spacepoints[middle]; + const internal_spacepoint hi = spacepoints[upper]; + + /* + * Find the lin-circles for the bottom and top pair. + */ + lin_circle lm = doublet_finding_helper::transform_coordinates(mi, lo, true); + lin_circle mh = + doublet_finding_helper::transform_coordinates(mi, hi, false); + + scalar iSinTheta2 = 1 + lm.cotTheta() * lm.cotTheta(); + scalar scatteringInRegion2 = + input.finder_conf.maxScatteringAngle2 * iSinTheta2; + scatteringInRegion2 *= + input.finder_conf.sigmaScattering * input.finder_conf.sigmaScattering; + scalar curvature, impact_parameter; + + /* + * Borrow the compatibility code from the existing seed finding code. + */ + if (triplet_finding_helper::isCompatible(mi, lm, mh, input.finder_conf, + iSinTheta2, scatteringInRegion2, + curvature, impact_parameter)) { + internal_seed r; + + r.spacepoints[0] = lower; + r.spacepoints[1] = middle; + r.spacepoints[2] = upper; + + /* + * Add a weight based on the impact parameter to the seed. + */ + r.weight = -impact_parameter * input.filter_conf.impactWeightFactor; + + return r; + } else { + /* + * If the triplet is invalid, return a bogus seed. + */ + return internal_seed::Invalid(); + } +} + +/** + * @brief Get the basic Range3D object + * + * This contains only the most basic cuts, but is designed to be refined later. + * + * @param[in] finder_conf The seed finder configuration to use. + * + * @return A 3D range object. + */ +__device__ range3d get_basic_range3d(const seedfinder_config finder_conf) { + range3d r = range3d::Infinite(); + + r.r_min = std::max(r.r_min, 0.f); + r.r_max = std::min(r.r_max, finder_conf.rMax); + + r.z_min = std::max(r.z_min, finder_conf.zMin); + r.z_max = std::min(r.z_max, finder_conf.zMax); + + return r; +} + +/** + * @brief Return the search range for a lower spacepoint given a middle + * spacepoint. + * + * @param[in] finder_conf The seed finder configuration to use. + * @param[in] s The middle spacepoint. + * + * @return A three-dimensional search range. + */ +__device__ range3d get_lower_range3d(const seedfinder_config finder_conf, + const internal_spacepoint& s) { + range3d r = get_basic_range3d(finder_conf); + + r.r_min = std::max(r.r_min, s.radius() - finder_conf.deltaRMax); + r.r_max = std::min(r.r_max, s.radius() - finder_conf.deltaRMin); + + float frac_r = r.r_min / s.radius(); + + float z_min2 = (s.z() - finder_conf.collisionRegionMin) * frac_r + + finder_conf.collisionRegionMin; + float z_max2 = (s.z() - finder_conf.collisionRegionMax) * frac_r + + finder_conf.collisionRegionMax; + + r.z_min = std::max(r.z_min, std::min(z_min2, s.z())); + r.z_max = std::min(r.z_max, std::max(z_max2, s.z())); + + r.z_min = std::max(r.z_min, s.z() - 450.f); + r.z_max = std::min(r.z_max, s.z() + 450.f); + + r.phi_min = std::max(r.phi_min, s.phi() - MAX_DELTA_PHI); + r.phi_max = std::min(r.phi_max, s.phi() + MAX_DELTA_PHI); + + return r; +} + +/** + * @brief Return the search range for an upper spacepoint given a middle + * spacepoint. + * + * @param[in] finder_conf The seed finder configuration to use. + * @param[in] s The middle spacepoint. + * + * @return A three-dimensional search range. + */ +__device__ range3d get_upper_range3d(const seedfinder_config finder_conf, + const internal_spacepoint& s) { + range3d r = get_basic_range3d(finder_conf); + + r.r_min = std::max(r.r_min, s.radius() + finder_conf.deltaRMin); + r.r_max = std::min(r.r_max, s.radius() + finder_conf.deltaRMax); + + float z_max2 = + (r.r_max / s.radius()) * (s.z() - finder_conf.collisionRegionMin) + + finder_conf.collisionRegionMin; + float z_min2 = + finder_conf.collisionRegionMax - + (r.r_max / s.radius()) * (finder_conf.collisionRegionMax - s.z()); + + if (s.z() > finder_conf.collisionRegionMin) { + r.z_max = std::min(r.z_max, z_max2); + } else if (s.z() < finder_conf.collisionRegionMax) { + r.z_min = std::max(r.z_min, z_min2); + } + + r.z_min = std::max( + r.z_min, s.z() - finder_conf.cotThetaMax * (r.r_max - s.radius())); + r.z_max = std::min( + r.z_max, s.z() + finder_conf.cotThetaMax * (r.r_max - s.radius())); + + r.z_min = std::max(r.z_min, s.z() - 450.f); + r.z_max = std::min(r.z_max, s.z() + 450.f); + + r.phi_min = std::max(r.phi_min, s.phi() - MAX_DELTA_PHI); + r.phi_max = std::min(r.phi_max, s.phi() + MAX_DELTA_PHI); + + return r; +} + +/** + * @brief Seed finding helper function. + * + * Turns out this function actually does most of the important work. + * + * @param[in] input Data parcel for seed finding configuration. + * @param[inout] internal_seeds The array to write internal seeds to. + * @param[in] increasing_z True iff we are looking for increasing-z seeds. + */ +__device__ void run_helper(const seed_finding_data_t& input, + internal_seed* internal_seeds, bool increasing_z) { + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + + /* + * Allocate shared memory for the lower and upper candidates. + */ + __shared__ uint32_t lower_sps[MAX_LOWER_SP_PER_MIDDLE]; + __shared__ uint32_t upper_sps[MAX_UPPER_SP_PER_MIDDLE]; + + /* + * The leader thread initializes the candidate counts to zero. + */ + __shared__ uint32_t num_lower, num_upper; + + if (block.thread_rank() == 0) { + num_lower = 0; + num_upper = 0; + } + + block.sync(); + + internal_spacepoint sp = + input.spacepoints[block.group_index().x]; + + /* + * Retrieve the search range for the lower spacepoints, then adjust it for + * monotinicity in z. + */ + range3d lower_range3d = get_lower_range3d(input.finder_conf, sp); + + if (increasing_z) { + lower_range3d.z_max = std::min(lower_range3d.z_max, sp.z()); + } else { + lower_range3d.z_min = std::max(lower_range3d.z_min, sp.z()); + } + + /* + * Same as above, but for the upper spacepoint. Note that some of the + * operations are reversed. + */ + range3d upper_range3d = get_upper_range3d(input.finder_conf, sp); + + if (increasing_z) { + upper_range3d.z_min = std::max(lower_range3d.z_min, sp.z()); + } else { + upper_range3d.z_max = std::min(lower_range3d.z_max, sp.z()); + } + + /* + * If either of the search ranges is degenerate, we can never find any + * candidate seeds. So we can safely exit. + */ + if (lower_range3d.phi_min >= lower_range3d.phi_max || + lower_range3d.r_min >= lower_range3d.r_max || + lower_range3d.z_min >= lower_range3d.z_max || + upper_range3d.phi_min >= upper_range3d.phi_max || + upper_range3d.r_min >= upper_range3d.r_max || + upper_range3d.z_min >= upper_range3d.z_max) { + return; + } + + /* + * Retrieve the lower and upper spacepoint candidates from the k-d tree. + */ + retrieve_from_tree(input, lower_range3d, block.group_index().x, false, + lower_sps, &num_lower); + + block.sync(); + + /* + * If we have zero lower candidates, we might as well quit; we will never + * have any candidates! + */ + if (num_lower == 0) { + return; + } + + retrieve_from_tree(input, upper_range3d, block.group_index().x, true, + upper_sps, &num_upper); + + block.sync(); + + /* + * Life fast and die young! (It's an early quitting condition) + */ + if (num_upper == 0) { + return; + } + + /* + * Calculate the total number of iterations, but take into account the + * maximum size of the lower and upper spacepoint storage arrays. + */ + uint32_t combinations = std::min(num_lower, MAX_LOWER_SP_PER_MIDDLE) * + std::min(num_upper, MAX_UPPER_SP_PER_MIDDLE); + + /* + * The iteration count is rounded up to a multiple of the block size to + * ensure that none of the threads slack off and exit prematurely. + */ + uint32_t iter_count = + combinations + (combinations % block.size() != 0 + ? block.size() - (combinations % block.size()) + : 0); + + for (uint32_t i = block.thread_rank(); i < iter_count; i += block.size()) { + uint32_t lower_id = i / num_upper; + uint32_t upper_id = i % num_upper; + + /* + * Consider the current combination and turn it into a seed. This seed + * may be bogus! But that is okay, because the sorting step below will + * automatically separate bogus seeds. + */ + if (lower_id < std::min(num_lower, MAX_LOWER_SP_PER_MIDDLE) && + upper_id < std::min(num_upper, MAX_UPPER_SP_PER_MIDDLE)) { + internal_seeds[block.thread_rank()] = + make_seed(input, input.spacepoints, lower_sps[lower_id], + block.group_index().x, upper_sps[upper_id]); + } + + block.sync(); + + bool sorted; + + /* + * Perform odd-even sorting, making sure that the seeds with the highest + * weight float to the top of the seed array, which is the part that + * will be written to the global output. + */ + do { + sorted = true; + + /* + * Odd step. + */ + for (uint32_t j = 2 * block.thread_rank() + 1; + j < block.size() + input.finder_conf.maxSeedsPerSpM - 1; + j += 2 * block.size()) { + bool swap = + internal_seeds[j].weight > internal_seeds[j + 1].weight; + internal_seed e = internal_seeds[j + 1]; + + if (swap) { + internal_seeds[j + 1] = internal_seeds[j]; + internal_seeds[j] = e; + sorted = false; + } + } + + block.sync(); + + /* + * Even step. + */ + for (uint32_t j = 2 * block.thread_rank(); + j < block.size() + input.finder_conf.maxSeedsPerSpM - 1; + j += 2 * block.size()) { + bool swap = + internal_seeds[j].weight > internal_seeds[j + 1].weight; + internal_seed e = internal_seeds[j + 1]; + + if (swap) { + internal_seeds[j + 1] = internal_seeds[j]; + internal_seeds[j] = e; + sorted = false; + } + } + } while (__syncthreads_or(!sorted)); + } +} + +/** + * @brief Main seed finding kernel. + * + * @note This kernel assumes exactly one warp per block and operates in a + * one-block-per-middle-SP fashion. + * + */ +__global__ void __launch_bounds__(32) + seed_finding_kernel(const seed_finding_data_t input, + internal_seed* output_seeds, + uint32_t* output_seed_size) { + cooperative_groups::thread_block block = + cooperative_groups::this_thread_block(); + + /* + * The seed finding configuration defines a few criteria that allow us to + * ignore some of the middle spacepoint candidates. + */ + if (input.spacepoints[block.group_index().x].phi() < + input.finder_conf.phiMin || + input.spacepoints[block.group_index().x].phi() > + input.finder_conf.phiMax || + input.spacepoints[block.group_index().x].radius() < + input.finder_conf.rMin || + input.spacepoints[block.group_index().x].radius() > + input.finder_conf.rMax || + input.spacepoints[block.group_index().x].z() < input.finder_conf.zMin || + input.spacepoints[block.group_index().x].z() > input.finder_conf.zMax) { + return; + } + + /* + * The internal seeds are stored in this shared array. + */ + extern __shared__ internal_seed internal_seeds[]; + + /* + * Initialize the internal seed array to contain only bogus seeds. + */ + for (uint32_t i = block.thread_rank(); + i < block.size() + input.finder_conf.maxSeedsPerSpM; + i += block.size()) { + internal_seeds[i].weight = std::numeric_limits::lowest(); + } + + block.sync(); + + /* + * Run the actual seed finding twice; once with monotonically increasing + * z values, and once with monotonically decreasing ones. + */ + run_helper(input, internal_seeds, true); + run_helper(input, internal_seeds, false); + + /* + * We have now completed the seeding algorithm, and we finish by writing + * the seeds to our output array. This task is reserved for the leader + * thread only, and all the other threads are idle. Thankfully, this should + * not take much time at all. + */ + if (block.thread_rank() == 0) { + /* + * First, we count the number of valid seeds that we will want to write + * to the global memory. + */ + uint32_t num_valid = 0; + + for (uint32_t i = 0; i < input.finder_conf.maxSeedsPerSpM; ++i) { + if (internal_seeds[block.size() + i].weight > -1000000.0f) { + ++num_valid; + } + } + + /* + * Next, reserve space atomically in the output array for our new + * seeds. + */ + uint32_t idx = atomicAdd(output_seed_size, num_valid); + + /* + * Finally, actually write the output to global memory. Again, this is + * done sequentially by the leader thread, but since the number of + * seeds is so small it should be fine. Notice that we count down from + * the end of the array, because valid seeds are always sorted higher + * than invalid ones (if we have any). + */ + for (uint32_t i = 0; i < num_valid; ++i) { + output_seeds[idx + i] = + internal_seeds[block.size() + input.finder_conf.maxSeedsPerSpM - + (i + 1)]; + } + } +} + +/** + * @brief Seeding entry point for C++ code. + * + * This is basically a wrapper for the seed finding kernel. + * + * @param[in] finder_conf The seed finder configuration to use. + * @param[in] filter_conf The seed filter configuration to use. + * @param[inout] mr The memory resource for allocations. + * @param[in] spacepoints The spacepoint array. + * @param[in] n_sp The total number of spacepoints. + * @param[in] tree The k-d tree to use for searching. + * @param[in] tree_size The total size of the k-d tree. + * + * @return A unique pointer to an array of internal seeds, and the size of that + * array. + */ +std::pair, uint32_t> run_seeding( + seedfinder_config finder_conf, seedfilter_config filter_conf, + vecmem::memory_resource& mr, + const internal_spacepoint* const spacepoints, uint32_t n_sp, + vecmem::unique_alloc_ptr&& tree, uint32_t tree_size) { + /* + * Allocate space for output of seeds on the device. + */ + vecmem::unique_alloc_ptr seeds_device = + vecmem::make_unique_alloc( + mr, finder_conf.maxSeedsPerSpM * n_sp); + + /* + * Allocate space for seed count on the device. + */ + vecmem::unique_alloc_ptr seed_count_device = + vecmem::make_unique_alloc(mr); + CUDA_ERROR_CHECK(cudaMemset(seed_count_device.get(), 0, sizeof(uint32_t))); + + /* + * Calculate the total amount of shared memory on top of that which is + * statically defined in the CUDA kernels. This is equal to one seed for + * each block plus one seed for each output seed. + */ + std::size_t total_shared_memory = + (THREADS_PER_BLOCK + finder_conf.maxSeedsPerSpM) * + sizeof(internal_seed); + + /* + * Call the kernel and all that jazz. + */ + seed_finding_kernel<<>>( + {finder_conf, filter_conf, spacepoints, tree.get(), tree_size}, + seeds_device.get(), seed_count_device.get()); + + CUDA_ERROR_CHECK(cudaGetLastError()); + CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + + /* + * Transfer the seed count back to the host and then hand it to the user. + */ + uint32_t seed_count_host; + CUDA_ERROR_CHECK(cudaMemcpy(&seed_count_host, seed_count_device.get(), + sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + return {std::move(seeds_device), seed_count_host}; +} +} // namespace traccc::cuda diff --git a/device/cuda/src/seeding2/kernels/write_output_kernel.cu b/device/cuda/src/seeding2/kernels/write_output_kernel.cu new file mode 100644 index 0000000000..29a159abd2 --- /dev/null +++ b/device/cuda/src/seeding2/kernels/write_output_kernel.cu @@ -0,0 +1,108 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace traccc::cuda { +/** + * @brief Main output writing kernel. + * + * @param[in] n_seeds The number of seeds to write. + * @param[in] spacepoints Spacepoint array into which seeds are indexed. + * @param[in] in_seeds Input internal seeds to read from. + * @param[out] out_seeds Output seeds to write to. + */ +__global__ void write_output_kernel( + uint32_t n_seeds, const internal_spacepoint* const spacepoints, + const internal_seed* const in_seeds, + vecmem::data::vector_view out_seeds) { + cooperative_groups::grid_group grid = cooperative_groups::this_grid(); + + /* + * Create a writable device vector for the seeds. + */ + vecmem::device_vector dvec(out_seeds); + + int i = grid.thread_rank(); + + if (i < n_seeds) { + /* + * Get the output seed for the current thread. + */ + seed& s = dvec.at(i); + + /* + * Write the data back to the output. + */ + s.spB_link.first = spacepoints[in_seeds[i].spacepoints[0]].m_link.first; + s.spB_link.second = + spacepoints[in_seeds[i].spacepoints[0]].m_link.second; + s.spM_link.first = spacepoints[in_seeds[i].spacepoints[1]].m_link.first; + s.spM_link.second = + spacepoints[in_seeds[i].spacepoints[1]].m_link.second; + s.spT_link.first = spacepoints[in_seeds[i].spacepoints[2]].m_link.first; + s.spT_link.second = + spacepoints[in_seeds[i].spacepoints[2]].m_link.second; + + s.weight = in_seeds[i].weight; + s.z_vertex = 0.f; + } +} + +vecmem::data::vector_buffer write_output( + const traccc::memory_resource& mr, uint32_t n_seeds, + const internal_spacepoint* const spacepoints, + const internal_seed* const seeds) { + static constexpr std::size_t threads_per_block = 256; + + /* + * Create a "vector buffer" that we can write our output to. + */ + vecmem::data::vector_buffer out(n_seeds, mr.main); + + /* + * Fetch the current CUDA device number. + */ + int device; + CUDA_ERROR_CHECK(cudaGetDevice(&device)); + + /* + * Instruct CUDA to prefetch the managed memory of the output to the + * device, to minimize the number of page faults. + * + * TODO: This needs a check to ensure that the memory is actually managed + * memory. + */ + CUDA_ERROR_CHECK( + cudaMemPrefetchAsync(out.ptr(), n_seeds * sizeof(seed), device)); + + /* + * Execute the main output kernel. + */ + write_output_kernel<<<(n_seeds / threads_per_block) + + (n_seeds % threads_per_block == 0 ? 0 : 1), + threads_per_block>>>(n_seeds, spacepoints, seeds, + out); + + CUDA_ERROR_CHECK(cudaGetLastError()); + CUDA_ERROR_CHECK(cudaDeviceSynchronize()); + + return out; +} +} // namespace traccc::cuda diff --git a/device/cuda/src/seeding2/seed_finding.cu b/device/cuda/src/seeding2/seed_finding.cu new file mode 100644 index 0000000000..5b94864530 --- /dev/null +++ b/device/cuda/src/seeding2/seed_finding.cu @@ -0,0 +1,108 @@ +/** TRACCC library, part of the ACTS project (R&D line) + * + * (c) 2022 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace traccc::cuda { +std::tuple[]>, + uint32_t> +copy_sps_to_device(vecmem::memory_resource& mr, + const spacepoint_container_types::const_view& sps) { + std::chrono::high_resolution_clock::time_point time_ex_start = + std::chrono::high_resolution_clock::now(); + + const traccc::spacepoint_container_types::const_device p(sps); + + std::vector> spacepoints; + + for (std::size_t i = 0; i < p.get_items().size(); ++i) { + for (std::size_t j = 0; j < p.get_items().at(i).size(); ++j) { + const traccc::spacepoint& sp = p.get_items().at(i).at(j); + + spacepoints.push_back( + internal_spacepoint(p, {i, j}, {0.f, 0.f})); + } + } + + uint32_t n_sp = spacepoints.size(); + + // Move the spacepoints to device memory. + vecmem::unique_alloc_ptr[]> + spacepoints_device = + vecmem::make_unique_alloc[]>( + mr, n_sp, spacepoints.data(), + [](internal_spacepoint* dst, + const internal_spacepoint* src, + std::size_t bytes) { + CUDA_ERROR_CHECK( + cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice)); + }); + + return {std::move(spacepoints_device), n_sp}; +} + +seed_finding2::seed_finding2(const traccc::memory_resource& mr) + : m_output_mr(mr), m_finder_conf() { + m_finder_conf.highland = + 13.6 * std::sqrt(m_finder_conf.radLengthPerSeed) * + (1 + 0.038 * std::log(m_finder_conf.radLengthPerSeed)); + float maxScatteringAngle = m_finder_conf.highland / m_finder_conf.minPt; + m_finder_conf.maxScatteringAngle2 = maxScatteringAngle * maxScatteringAngle; + + // helix radius in homogeneous magnetic field. Units are Kilotesla, MeV + // and millimeter + // TODO: change using ACTS units + m_finder_conf.pTPerHelixRadius = 300. * m_finder_conf.bFieldInZ; + m_finder_conf.minHelixDiameter2 = + std::pow(m_finder_conf.minPt * 2 / m_finder_conf.pTPerHelixRadius, 2); + m_finder_conf.pT2perRadius = + std::pow(m_finder_conf.highland / m_finder_conf.pTPerHelixRadius, 2); +} + +vecmem::data::vector_buffer seed_finding2::operator()( + const spacepoint_container_types::const_view& sps) const { + vecmem::cuda::device_memory_resource mr; + vecmem::unique_alloc_ptr[]> + spacepoints_device; + uint32_t n_sp; + + std::tie(spacepoints_device, n_sp) = copy_sps_to_device(mr, sps); + + vecmem::unique_alloc_ptr kd_tree_device; + uint32_t kd_tree_size; + + std::tie(kd_tree_device, kd_tree_size) = + create_kd_tree(mr, spacepoints_device.get(), n_sp); + + vecmem::unique_alloc_ptr seeds_device; + uint32_t seed_count; + + std::tie(seeds_device, seed_count) = + run_seeding(m_finder_conf, m_filter_conf, mr, spacepoints_device.get(), + n_sp, std::move(kd_tree_device), kd_tree_size); + + return write_output(m_output_mr, seed_count, spacepoints_device.get(), + seeds_device.get()); +} +} // namespace traccc::cuda