From b9218a2779ab47e539ab315d4effe680fe499b9c Mon Sep 17 00:00:00 2001
From: Stephen Nicholas Swatman <stephen.nicholas.swatman@cern.ch>
Date: Tue, 20 Sep 2022 10:17:23 +0200
Subject: [PATCH] New seeding

---
 device/cuda/CMakeLists.txt                    |   5 +
 .../cuda/seeding2/kernels/kd_tree_kernel.hpp  |  24 +
 .../seeding2/kernels/seed_finding_kernel.hpp  |  38 +
 .../seeding2/kernels/write_output_kernel.hpp  |  25 +
 .../traccc/cuda/seeding2/seed_finding.hpp     |  35 +
 .../cuda/seeding2/types/internal_seed.hpp     |  29 +
 .../traccc/cuda/seeding2/types/kd_tree.hpp    |  54 ++
 .../traccc/cuda/seeding2/types/range3d.hpp    |  58 ++
 .../src/seeding2/kernels/kd_tree_kernel.cu    | 807 ++++++++++++++++++
 .../seeding2/kernels/seed_finding_kernel.cu   | 715 ++++++++++++++++
 .../seeding2/kernels/write_output_kernel.cu   | 108 +++
 device/cuda/src/seeding2/seed_finding.cu      | 108 +++
 12 files changed, 2006 insertions(+)
 create mode 100644 device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp
 create mode 100644 device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp
 create mode 100644 device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp
 create mode 100644 device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp
 create mode 100644 device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp
 create mode 100644 device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp
 create mode 100644 device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp
 create mode 100644 device/cuda/src/seeding2/kernels/kd_tree_kernel.cu
 create mode 100644 device/cuda/src/seeding2/kernels/seed_finding_kernel.cu
 create mode 100644 device/cuda/src/seeding2/kernels/write_output_kernel.cu
 create mode 100644 device/cuda/src/seeding2/seed_finding.cu
diff --git a/device/cuda/CMakeLists.txt b/device/cuda/CMakeLists.txt
index 74afa34f51..6089b21e27 100644
--- a/device/cuda/CMakeLists.txt
+++ b/device/cuda/CMakeLists.txt
@@ -30,6 +30,11 @@ traccc_add_library( traccc_cuda cuda TYPE SHARED
   # Clusterization
   "include/traccc/cuda/clusterization/clusterization_algorithm.hpp"
   "src/clusterization/clusterization_algorithm.cu"
+  # Alternate seeding.
+  "src/seeding2/seed_finding.cu"
+  "src/seeding2/kernels/seed_finding_kernel.cu"
+  "src/seeding2/kernels/kd_tree_kernel.cu"
+  "src/seeding2/kernels/write_output_kernel.cu"
 )
 target_compile_options( traccc_cuda
   PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr> )
diff --git a/device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp b/device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp
new file mode 100644
index 0000000000..ed8677ff58
--- /dev/null
+++ b/device/cuda/include/traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp
@@ -0,0 +1,24 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <traccc/cuda/seeding2/types/kd_tree.hpp>
+#include <traccc/edm/internal_spacepoint.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <vector>
+
+namespace traccc::cuda {
+/**
+ * @brief Creates a k-d tree from a given set of spacepoints.
+ *
+ * @return A pair containing the k-d tree nodes as well as the number of nodes.
+ */
+std::pair<vecmem::unique_alloc_ptr<kd_tree_t[]>, uint32_t> create_kd_tree(
+    vecmem::memory_resource&, const internal_spacepoint<spacepoint>* const,
+    uint32_t);
+}  // namespace traccc::cuda
diff --git a/device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp b/device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp
new file mode 100644
index 0000000000..99c795504f
--- /dev/null
+++ b/device/cuda/include/traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp
@@ -0,0 +1,38 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <traccc/cuda/seeding2/types/internal_seed.hpp>
+#include <traccc/cuda/seeding2/types/kd_tree.hpp>
+#include <traccc/edm/internal_spacepoint.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <traccc/seeding/detail/seeding_config.hpp>
+
+namespace traccc::cuda {
+/**
+ * @brief Convenience data structure for all the data we need for seeding.
+ */
+struct seed_finding_data_t {
+    const seedfinder_config finder_conf;
+    const seedfilter_config filter_conf;
+    const internal_spacepoint<spacepoint>* spacepoints;
+    const kd_tree_t* tree;
+    const uint32_t tree_nodes;
+};
+
+/**
+ * @brief Execute the seed finding kernel itself.
+ *
+ * @return A pair containing the list of internal seeds as well as the number
+ * of seeds.
+ */
+std::pair<vecmem::unique_alloc_ptr<internal_seed[]>, uint32_t> run_seeding(
+    seedfinder_config, seedfilter_config, vecmem::memory_resource&,
+    const internal_spacepoint<spacepoint>* const, uint32_t,
+    vecmem::unique_alloc_ptr<kd_tree_t[]>&&, uint32_t);
+}  // namespace traccc::cuda
diff --git a/device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp b/device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp
new file mode 100644
index 0000000000..4e28edcb54
--- /dev/null
+++ b/device/cuda/include/traccc/cuda/seeding2/kernels/write_output_kernel.hpp
@@ -0,0 +1,25 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <traccc/cuda/seeding2/types/internal_seed.hpp>
+#include <traccc/edm/internal_spacepoint.hpp>
+#include <traccc/edm/seed.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <traccc/utils/memory_resource.hpp>
+
+namespace traccc::cuda {
+/**
+ * @brief Kernel to write output data back into traccc's EDM.
+ *
+ * @return A vector buffer containing the output seeds.
+ */
+vecmem::data::vector_buffer<seed> write_output(
+    const traccc::memory_resource &, uint32_t,
+    const internal_spacepoint<spacepoint> *const, const internal_seed *const);
+}  // namespace traccc::cuda
diff --git a/device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp b/device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp
new file mode 100644
index 0000000000..17bc9e3110
--- /dev/null
+++ b/device/cuda/include/traccc/cuda/seeding2/seed_finding.hpp
@@ -0,0 +1,35 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <traccc/edm/seed.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <traccc/seeding/detail/seeding_config.hpp>
+#include <traccc/utils/algorithm.hpp>
+#include <traccc/utils/memory_resource.hpp>
+#include <vecmem/memory/cuda/device_memory_resource.hpp>
+
+namespace traccc::cuda {
+/**
+ * @brief Alternative seed finding algorithm, using orthogonal range search
+ * implemented through a k-d tree.
+ */
+class seed_finding2 : public algorithm<vecmem::data::vector_buffer<seed>(
+                          const spacepoint_container_types::const_view&)> {
+    public:
+    seed_finding2(const traccc::memory_resource& mr);
+
+    vecmem::data::vector_buffer<seed> operator()(
+        const spacepoint_container_types::const_view& sps) const override;
+
+    private:
+    traccc::memory_resource m_output_mr;
+    seedfinder_config m_finder_conf;
+    seedfilter_config m_filter_conf;
+};
+}  // namespace traccc::cuda
diff --git a/device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp b/device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp
new file mode 100644
index 0000000000..e26919a39b
--- /dev/null
+++ b/device/cuda/include/traccc/cuda/seeding2/types/internal_seed.hpp
@@ -0,0 +1,29 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+
+namespace traccc::cuda {
+/**
+ * @brief Internal structure representing a single seed.
+ */
+struct internal_seed {
+    std::array<uint32_t, 3> spacepoints;
+    float weight;
+
+    __host__ __device__ static internal_seed Invalid() {
+        internal_seed r;
+
+        r.weight = std::numeric_limits<float>::lowest();
+
+        return r;
+    }
+};
+}  // namespace traccc::cuda
diff --git a/device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp b/device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp
new file mode 100644
index 0000000000..7867d87b6d
--- /dev/null
+++ b/device/cuda/include/traccc/cuda/seeding2/types/kd_tree.hpp
@@ -0,0 +1,54 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <traccc/cuda/seeding2/types/range3d.hpp>
+
+namespace traccc::cuda {
+/**
+ * @brief Data structure representing a node in a k-d tree.
+ *
+ * @tparam _value_t Type of values in the tree.
+ * @tparam _nodes_i Number of values in each leaf node.
+ */
+template <typename _value_t, std::size_t _nodes_i>
+struct kd_tree_node {
+    struct element_wrapper_t {
+        _value_t value;
+        float phi, r, z;
+    };
+
+    enum class nodetype_e { LEAF, INTERNAL, NON_EXTANT, INCOMPLETE };
+
+    enum class pivot_e { Phi, R, Z };
+
+    static constexpr std::size_t MAX_NODES_PER_LEAF = _nodes_i;
+
+    nodetype_e type;
+
+    range3d range;
+
+    union {
+        struct {
+            element_wrapper_t points[MAX_NODES_PER_LEAF];
+            uint32_t point_count;
+        } leaf;
+        struct {
+            pivot_e dim;
+            float mid;
+        } internal;
+        struct {
+        } non_extant;
+        struct {
+            uint32_t begin, end;
+        } incomplete;
+    };
+};
+
+using kd_tree_t = kd_tree_node<uint32_t, 8>;
+}  // namespace traccc::cuda
diff --git a/device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp b/device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp
new file mode 100644
index 0000000000..e4bb6172a1
--- /dev/null
+++ b/device/cuda/include/traccc/cuda/seeding2/types/range3d.hpp
@@ -0,0 +1,58 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+
+namespace traccc::cuda {
+/**
+ * @brief Range in three dimensions, defined by minima and maxima in the phi,
+ * r, and z dimensions.
+ */
+struct range3d {
+    float phi_min, phi_max, r_min, r_max, z_min, z_max;
+
+    __host__ __device__ static range3d Infinite() {
+        range3d r;
+
+        r.phi_min = std::numeric_limits<float>::lowest();
+        r.phi_max = std::numeric_limits<float>::max();
+        r.r_min = 0.f;
+        r.r_max = std::numeric_limits<float>::max();
+        r.z_min = std::numeric_limits<float>::lowest();
+        r.z_max = std::numeric_limits<float>::max();
+
+        return r;
+    }
+
+    __host__ __device__ static range3d Degenerate() {
+        range3d r;
+
+        r.phi_min = std::numeric_limits<float>::max();
+        r.phi_max = std::numeric_limits<float>::lowest();
+        r.r_min = std::numeric_limits<float>::max();
+        r.r_max = 0.f;
+        r.z_min = std::numeric_limits<float>::max();
+        r.z_max = std::numeric_limits<float>::lowest();
+
+        return r;
+    }
+
+    __host__ __device__ bool intersects(const range3d& o) const {
+        return phi_min <= o.phi_max && o.phi_min < phi_max &&
+               r_min <= o.r_max && o.r_min < r_max && z_min <= o.z_max &&
+               o.z_min < z_max;
+    }
+
+    __host__ __device__ bool contains(float phi, float r, float z) const {
+        return phi_min <= phi && phi <= phi_max && r_min <= r && r <= r_max &&
+               z_min <= z && z <= z_max;
+    }
+};
+}  // namespace traccc::cuda
diff --git a/device/cuda/src/seeding2/kernels/kd_tree_kernel.cu b/device/cuda/src/seeding2/kernels/kd_tree_kernel.cu
new file mode 100644
index 0000000000..344d0419cc
--- /dev/null
+++ b/device/cuda/src/seeding2/kernels/kd_tree_kernel.cu
@@ -0,0 +1,807 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <cooperative_groups.h>
+
+#include <limits>
+#include <memory>
+#include <traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp>
+#include <traccc/cuda/seeding2/seed_finding.hpp>
+#include <traccc/cuda/seeding2/types/kd_tree.hpp>
+#include <traccc/cuda/seeding2/types/range3d.hpp>
+#include <traccc/cuda/utils/definitions.hpp>
+#include <traccc/edm/seed.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <traccc/seeding/detail/lin_circle.hpp>
+#include <traccc/seeding/detail/seeding_config.hpp>
+#include <vecmem/memory/memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+#include <vector>
+
+namespace traccc::cuda {
+/**
+ * @brief Bit mask for halo point indices.
+ *
+ * Our phi-axis search range wraps around, and k-d trees do not natively
+ * support this. Thankfully, we can solve this problem by simply duplicating
+ * points at the edges of the phi range. These are called phi halo points. We
+ * mark these points by setting the MSB in their index. This means that, in
+ * principle, the index is invalid, but it also saves quite a lot of space.
+ * This mask contains the exact bit that is set as the mask.
+ */
+constexpr uint32_t AUXILIARY_INDEX_MASK = (1u << 31);
+
+/**
+ * @brief Width of phi-halo range.
+ *
+ * This value defines the range in which points are copied and treated as halo
+ * points. If this width is ε, then points below -π + ε and above π - ε are
+ * halo points. This value should be at least as large as the maximum phi delta
+ * betweel spacepoints.
+ */
+constexpr float PHI_HALO_WIDTH = 0.06f;
+
+/**
+ * @brief Get coordinate value for a halo-aware index in a given list of
+ * spacepoints.
+ *
+ * For non-halo points, this function simply returns the phi value of the
+ * point. But for halo points, it calculates the proper adjusted phi value. For
+ * points in the range -π ≤ φ ≤ -π + ε, this is φ + 2π. For points in the range
+ * π - ε ≤ φ ≤ π it is φ - 2π.
+ *
+ * @param[in] spacepoints The array of spacepoints.
+ * @param[in] i The halo-aware index in the spacepoint array.
+ *
+ * @return The ϕ value of the spacepoint with the given index.
+ */
+__device__ float get_coordinate(
+    const internal_spacepoint<spacepoint>* const __restrict__ spacepoints,
+    uint32_t i, kd_tree_t::pivot_e dim) {
+    bool is_real = !(i & AUXILIARY_INDEX_MASK);
+
+    if (is_real) {
+        if (dim == kd_tree_t::pivot_e::Phi) {
+            return spacepoints[i].phi();
+        } else if (dim == kd_tree_t::pivot_e::R) {
+            return spacepoints[i].radius();
+        } else {
+            return spacepoints[i].z();
+        }
+    } else {
+        uint32_t real_i = i & ~AUXILIARY_INDEX_MASK;
+
+        if (dim == kd_tree_t::pivot_e::Phi) {
+            float phi = spacepoints[real_i].phi();
+
+            if (phi >= 0) {
+                return phi - 2 * M_PI;
+            } else {
+                return phi + 2 * M_PI;
+            }
+        } else if (dim == kd_tree_t::pivot_e::R) {
+            return spacepoints[real_i].radius();
+        } else {
+            return spacepoints[real_i].z();
+        }
+    }
+}
+
+/**
+ * @brief Initialize the index vector used to build the k-d tree.
+ *
+ * Our k-d tree construction algorithm keeps track of spacepoints using a big
+ * index array, which is initialized here. Given n spacepoints, this has n + m
+ * elements, where m is the number of halo points.
+ *
+ * @param[in] spacepoints The list of spacepoints.
+ * @param[out] indices The output array to write to.
+ * @param[in] sps The total number of spacepoints.
+ * @param[out] extra_indices The output halo point count.
+ */
+__global__ void initialize_index_vector(
+    const internal_spacepoint<spacepoint>* const __restrict__ spacepoints,
+    uint32_t* __restrict__ indices, uint32_t sps,
+    uint32_t* __restrict__ extra_indices) {
+    cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+
+    if (grid.thread_rank() < sps) {
+        indices[grid.thread_rank()] = grid.thread_rank();
+
+        /*
+         * If we are a halo point, add an extra index for our evil twin!
+         */
+        if (spacepoints[grid.thread_rank()].phi() < (-M_PI + PHI_HALO_WIDTH) ||
+            spacepoints[grid.thread_rank()].phi() > (M_PI - PHI_HALO_WIDTH)) {
+            indices[sps + atomicAdd(extra_indices, 1u)] =
+                grid.thread_rank() | AUXILIARY_INDEX_MASK;
+        }
+    }
+}
+
+/**
+ * @brief Kernel that initializes the k-d tree.
+ *
+ * This kernel simply sets every node in a given k-d tree to be non-extant
+ * except for the root (at index 0) which is considered an incomplete node.
+ *
+ * @param[out] tree The array of tree nodes to write to.
+ * @param[in] num_nodes The total number of tree nodes.
+ * @param[in] num_indices The total number of indices.
+ */
+__global__ void initialize_kd_tree(kd_tree_t* __restrict__ tree,
+                                   uint32_t num_nodes, uint32_t num_indices) {
+    cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+
+    if (grid.thread_rank() == 0) {
+        /*
+         * The root node starts as an incomplete node covering all indices in
+         * the index array. Its range is initialized as an infinite range.
+         */
+        tree[0].type = kd_tree_t::nodetype_e::INCOMPLETE;
+        tree[0].incomplete.begin = 0;
+        tree[0].incomplete.end = num_indices;
+        tree[0].range = range3d::Infinite();
+    } else if (grid.thread_rank() < num_nodes) {
+        /*
+         * All other nodes are non-extant.
+         */
+        tree[grid.thread_rank()].type = kd_tree_t::nodetype_e::NON_EXTANT;
+    }
+}
+
+/**
+ * @brief Helper function to determine the next pivot axis.
+ *
+ * k-d trees work by pivoting along a certain axis. In this implementation, we
+ * simply cycle through dimensions because that's very fast. We go in the order
+ * ϕ to r to z, and then repeat.
+ *
+ * @param[in] p The old pivot axis.
+ *
+ * @return The new pivot axis.
+ */
+__device__ typename kd_tree_t::pivot_e next_pivot(
+    typename kd_tree_t::pivot_e p) {
+    if (p == kd_tree_t::pivot_e::Phi) {
+        return kd_tree_t::pivot_e::R;
+    } else if (p == kd_tree_t::pivot_e::R) {
+        return kd_tree_t::pivot_e::Z;
+    } else {
+        return kd_tree_t::pivot_e::Phi;
+    }
+}
+
+/**
+ * @brief Approximate mean finding algorithm for large numbers of spacepoints.
+ *
+ * When we have a lot of spacepoints to find the median for, doing so exactly
+ * is too slow. Thus, we use a sampling approach; we select a subset of points
+ * and calculate the median of those points.
+ *
+ * @param[in] spacepoints The list of spacepoints.
+ * @param[inout] indices The list of indices for _this_ computation.
+ * @param[in] num_indices The number of indices for _this_ function call.
+ * @param[in] pivot The requested axis along which to split.
+ *
+ * @return The approximate median value of the given indices in the given
+ * pivot axis.
+ */
+__device__ float get_approximate_median(
+    const internal_spacepoint<spacepoint>* const __restrict__ spacepoints,
+    uint32_t* __restrict__ indices, uint32_t num_indices,
+    typename kd_tree_t::pivot_e pivot) {
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+
+    /*
+     * First, we declare some shared memory which will hold our samples from
+     * which to compute the median. We assume a maximum of one item per thread.
+     */
+    extern __shared__ float values[];
+
+    /*
+     * Assert that the number of indices is at least as big as the block size,
+     * otherwise this computation might not work as designed!
+     */
+    assert(num_indices >= block.size());
+
+    /*
+     * Calculate the stride between samples in the spacepoint array.
+     */
+    uint32_t delta = num_indices / block.size();
+
+    /*
+     * Calculate the index for the current thread.
+     */
+    uint32_t index = indices[(block.thread_rank() * delta) % num_indices];
+
+    /*
+     * Calculate the value of the chosen spacepoint in the requested pivot
+     * dimension and store it in the shared memory.
+     */
+    values[block.thread_rank()] = get_coordinate(spacepoints, index, pivot);
+
+    bool sorted;
+
+    block.sync();
+
+    /*
+     * Implementation of odd-even sort to make sure that the values in our
+     * shared memory are sorted. For small ranges as we have here, this is
+     * reasonably efficient.
+     */
+    do {
+        sorted = true;
+
+        /*
+         * Odd step.
+         */
+        for (uint32_t j = 2 * block.thread_rank() + 1; j < block.size() - 1;
+             j += 2 * block.size()) {
+            float v1 = values[j];
+            float v2 = values[j + 1];
+
+            if (v1 > v2) {
+                values[j + 1] = v1;
+                values[j] = v2;
+                sorted = false;
+            }
+        }
+
+        block.sync();
+
+        /*
+         * Even step.
+         */
+        for (uint32_t j = 2 * block.thread_rank(); j < block.size() - 1;
+             j += 2 * block.size()) {
+            float v1 = values[j];
+            float v2 = values[j + 1];
+
+            if (v1 > v2) {
+                values[j + 1] = v1;
+                values[j] = v2;
+                sorted = false;
+            }
+        }
+    } while (__syncthreads_or(!sorted));
+
+    /*
+     * Now that our sorting is complete, we can simply return the middle value
+     * in the range to get the median.
+     */
+    return values[(block.size() + 1) / 2];
+}
+
+/**
+ * @brief Exact mean finding algorithm for small ranges.
+ *
+ * When the array of indices is relatively small, we can afford to find the
+ * exact median. We do this by sorting the data and then returning the index
+ * (rather than the value) of the median.
+ *
+ * @warning This function has the side effect that it leaves the index array
+ * sorted!
+ *
+ * @param[in] spacepoints The array of spacepoints.
+ * @param[inout] indices The array of indices for this function call.
+ * @param[in] num_indices The number of indices for this function call.
+ * @param[in] pivot The requested pivot axis.
+ */
+__device__ uint32_t sort_and_get_median(
+    const internal_spacepoint<spacepoint>* const __restrict__ spacepoints,
+    uint32_t* __restrict__ indices, uint32_t num_indices,
+    typename kd_tree_t::pivot_e pivot) {
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+
+    /*
+     * Simple perform an odd-even sort to sort the range of indices by the
+     * value of the corresponding SPs in the requested pivot axis.
+     */
+    bool sorted;
+
+    do {
+        sorted = true;
+
+        /*
+         * Odd step.
+         */
+        for (uint32_t j = 2 * block.thread_rank() + 1; j < num_indices - 1;
+             j += 2 * block.size()) {
+            float wJ = get_coordinate(spacepoints, indices[j], pivot);
+            float wM = get_coordinate(spacepoints, indices[j + 1], pivot);
+
+            bool swap = wJ > wM;
+            uint32_t e = indices[j + 1];
+
+            if (swap) {
+                indices[j + 1] = indices[j];
+                indices[j] = e;
+                sorted = false;
+            }
+        }
+
+        block.sync();
+
+        /*
+         * Even step.
+         */
+        for (uint32_t j = 2 * block.thread_rank(); j < num_indices - 1;
+             j += 2 * block.size()) {
+            float wJ = get_coordinate(spacepoints, indices[j], pivot);
+            float wM = get_coordinate(spacepoints, indices[j + 1], pivot);
+
+            bool swap = wJ > wM;
+            uint32_t e = indices[j + 1];
+
+            if (swap) {
+                indices[j + 1] = indices[j];
+                indices[j] = e;
+                sorted = false;
+            }
+        }
+    } while (__syncthreads_or(!sorted));
+
+    /*
+     * Remember, we are returning an index here, not a value!
+     */
+    return (num_indices + 1) / 2;
+}
+
+/**
+ * @brief Create a leaf node in the given k-d tree.
+ *
+ * @param[inout] tree The tree in which to create the leaf node.
+ * @param[in] spacepoints The global array of spacepoints.
+ * @param[in] indices The indices from which to create a leaf node.
+ * @param[in] nid The node index.
+ */
+__device__ bool create_leaf_node(
+    kd_tree_t* __restrict__ tree,
+    const internal_spacepoint<spacepoint>* const __restrict__ spacepoints,
+    const uint32_t* const __restrict__ indices, int nid) {
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+    /*
+     * Get the current node.
+     */
+    kd_tree_t& node = tree[nid];
+
+    /*
+     * Set the node to a leaf, and set the number of points it contains.
+     */
+    if (block.thread_rank() == 0) {
+        node.type = kd_tree_t::nodetype_e::LEAF;
+        node.leaf.point_count = (node.incomplete.end - node.incomplete.begin);
+    }
+
+    block.sync();
+
+    /*
+     * Store the given points in this leaf, retrieving their exact positions
+     * from the spacepoint array.
+     */
+    for (uint32_t i = block.thread_rank(); i < node.leaf.point_count;
+         i += block.size()) {
+        uint32_t ridx = indices[i] & ~AUXILIARY_INDEX_MASK;
+
+        node.leaf.points[i].value = ridx;
+        node.leaf.points[i].phi =
+            get_coordinate(spacepoints, indices[i], kd_tree_t::pivot_e::Phi);
+        node.leaf.points[i].r = spacepoints[ridx].radius();
+        node.leaf.points[i].z = spacepoints[ridx].z();
+    }
+
+    /*
+     * This function always succeeds.
+     */
+    return true;
+}
+
+/**
+ * @brief Create an internal node in the k-d tree.
+ *
+ * This function constructs an internal node in the k-d tree at a given
+ * position.
+ *
+ * @warning This function has the side-effect of re-ordering the index array.
+ *
+ * @param[inout] tree The k-d tree in which to build the node.
+ * @param[in] spacepoints The spacepoint array.
+ * @param[inout] indices The indices from which to build an internal node.
+ * @param[inout] index_buffer Temporary storage for partitioning the indices.
+ * @param[in] nid The node index.
+ * @param[in] pivot The requested pivot axis.
+ */
+__device__ bool create_internal_node(
+    kd_tree_t* __restrict__ tree,
+    const internal_spacepoint<spacepoint>* const __restrict__ spacepoints,
+    uint32_t* __restrict__ indices, uint32_t* __restrict__ index_buffer,
+    int nid, typename kd_tree_t::pivot_e pivot) {
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+
+    /*
+     * Get the node and it's left and right children.
+     */
+    kd_tree_t& node = tree[nid];
+    kd_tree_t& lhc = tree[2 * nid + 1];
+    kd_tree_t& rhc = tree[2 * nid + 2];
+
+    /*
+     * Start the construction of the two children, only if we are the leader
+     * thread.
+     */
+    if (block.thread_rank() == 0) {
+        lhc.type = kd_tree_t::nodetype_e::INCOMPLETE;
+        rhc.type = kd_tree_t::nodetype_e::INCOMPLETE;
+    }
+
+    /*
+     * Let's hope this doesn't fire, but if we have zero (or god forbid a
+     * negative number) nodes, we have a big problem.
+     */
+    assert(node.incomplete.end > node.incomplete.begin);
+
+    uint32_t num_indices = node.incomplete.end - node.incomplete.begin;
+
+    block.sync();
+
+    float mid_point_value;
+
+    /*
+     * We have two different branches here; for small nodes we take an exact
+     * approach, for big nodes we make approximations.
+     */
+    if (num_indices > block.size()) {
+        /*
+         * These counters are used for partitioning, and count how many points
+         * we have that are greater than our pivot, as well as how many are
+         * less than the pivot.
+         */
+        __shared__ uint32_t lower_idx, upper_idx;
+
+        if (block.thread_rank() == 0) {
+            lower_idx = 0;
+            upper_idx = 0;
+        }
+
+        /*
+         * Find the approximate median value of the given spacepoints in the
+         * selected dimension.
+         */
+        float mid_point =
+            get_approximate_median(spacepoints, indices, num_indices, pivot);
+
+        block.sync();
+
+        /*
+         * Now, we partition the spacepoints (or rather, their indices) into
+         * lower and upper points, counting the number of elements we have in
+         * each.
+         */
+        for (uint32_t i = block.thread_rank(); i < num_indices;
+             i += block.size()) {
+            if (get_coordinate(spacepoints, indices[i], pivot) >= mid_point) {
+                index_buffer[num_indices - (1u + atomicAdd(&upper_idx, 1u))] =
+                    indices[i];
+            } else {
+                index_buffer[atomicAdd(&lower_idx, 1u)] = indices[i];
+            }
+        }
+
+        block.sync();
+
+        /*
+         * Next, move the spacepoints from the partitioned buffer back into the
+         * main index array.
+         */
+        for (uint32_t i = block.thread_rank(); i < num_indices;
+             i += block.size()) {
+            indices[i] = index_buffer[i];
+        }
+
+        block.sync();
+
+        /*
+         * Finally, set the beginning and end of our children, as well as the
+         * splitting point and the current node.
+         */
+        if (block.thread_rank() == 0) {
+            lhc.incomplete.begin = node.incomplete.begin;
+            lhc.incomplete.end = lhc.incomplete.begin + lower_idx;
+
+            rhc.incomplete.begin = lhc.incomplete.end;
+            rhc.incomplete.end = node.incomplete.end;
+
+            mid_point_value = mid_point;
+        }
+
+        block.sync();
+    } else {
+        /*
+         * For small ranges, we just get the exact median. We have a helper
+         * function for this which also sorts the indices as a side-effect, and
+         * so we do not actually need to do any partitioning.
+         */
+        uint32_t mid_point =
+            sort_and_get_median(spacepoints, indices, num_indices, pivot);
+
+        /*
+         * Correctly set the beginning and the of our children, as well as our
+         * own pivot value.
+         */
+        if (block.thread_rank() == 0) {
+            lhc.incomplete.begin = node.incomplete.begin;
+            lhc.incomplete.end = lhc.incomplete.begin + mid_point;
+
+            rhc.incomplete.begin = lhc.incomplete.end;
+            rhc.incomplete.end = node.incomplete.end;
+
+            mid_point_value =
+                get_coordinate(spacepoints, indices[mid_point], pivot);
+        }
+
+        block.sync();
+    }
+
+    /*
+     * Now that the partitioning is complete, we can have our leader thread
+     * put the dots on the i's and complete this internal node.
+     */
+    if (block.thread_rank() == 0) {
+        node.type = kd_tree_t::nodetype_e::INTERNAL;
+        node.internal.dim = pivot;
+        node.internal.mid = mid_point_value;
+
+        lhc.range = node.range;
+        rhc.range = node.range;
+
+        /*
+         * We need to make sure that we update the ranges of our children!
+         */
+        if (node.internal.dim == kd_tree_t::pivot_e::Phi) {
+            lhc.range.phi_max = mid_point_value;
+            rhc.range.phi_min = mid_point_value;
+        } else if (node.internal.dim == kd_tree_t::pivot_e::R) {
+            lhc.range.r_max = mid_point_value;
+            rhc.range.r_min = mid_point_value;
+        } else {
+            lhc.range.z_max = mid_point_value;
+            rhc.range.z_min = mid_point_value;
+        }
+    }
+
+    /*
+     * This function also always succeeds.
+     */
+    return true;
+}
+
+/**
+ * @brief Main k-d tree construction kernel.
+ *
+ * A k-d tree is constructed by repeated application of this kernel. This
+ * kernel uses a one-node-per-block approach. This is not optimal for early
+ * iteration counts.
+ *
+ * @warning This kernel assumes that the tree has already been initialized
+ * through the preceding kernels.
+ *
+ * @param[inout] tree The tree array in which to build.
+ * @param[in] num_nodes The maximum size of the k-d tree.
+ * @param[in] spacepoints The spacepoints to encode in the tree.
+ * @param[inout] _indices The index array to use.
+ * @param[inout] _index_buffer Temporary buffer space for indices.
+ * @param[in] iteration The iteration count.
+ */
+__global__ void construct_kd_tree_step(
+    kd_tree_t* __restrict__ tree, uint32_t num_nodes,
+    const internal_spacepoint<spacepoint>* const __restrict__ spacepoints,
+    uint32_t* __restrict__ _indices, uint32_t* __restrict__ _index_buffer,
+    uint32_t iteration) {
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+
+    /*
+     * Calculate the current node ID. Remember that we start with 1 node
+     * indexed at [0], then 2 nodes at [1, 2], then 4 nodes at [3, 4, 5, 6],
+     * etc. Thus, the subarray for zero-indexed iteration i starts at 2^i - 1.
+     */
+    int nid = ((1 << iteration) - 1) + blockIdx.x;
+
+    if (nid >= num_nodes) {
+        return;
+    }
+
+    /*
+     * Get the current node using the node ID.
+     */
+    kd_tree_t& node = tree[nid];
+
+    /*
+     * Useful assertion for debugging, to ensure that this kernel only ever
+     * encounters incomplete or non-extant nodes, as it should.
+     */
+    [[maybe_unused]] bool is_correct_node_type =
+        node.type == kd_tree_t::nodetype_e::INCOMPLETE ||
+        node.type == kd_tree_t::nodetype_e::NON_EXTANT;
+
+    assert(is_correct_node_type);
+
+    /*
+     * Note that this is not equivalent per se to the above, and that this is
+     * not an error. But it does indicate that there is no work to be done.
+     */
+    if (node.type != kd_tree_t::nodetype_e::INCOMPLETE) {
+        return;
+    }
+
+    bool build_leaf = node.incomplete.end - node.incomplete.begin <=
+                      kd_tree_t::MAX_NODES_PER_LEAF;
+
+    block.sync();
+
+    /*
+     * Check to see whether we could fit the range assigned to this block into
+     * a leaf node. If so, we construct one. Otherwise, we construct an
+     * internal node.
+     */
+    if (build_leaf) {
+        [[maybe_unused]] bool r = create_leaf_node(
+            tree, spacepoints, &_indices[node.incomplete.begin], nid);
+
+        assert(r);
+    } else {
+        [[maybe_unused]] bool r = false;
+
+        typename kd_tree_t::pivot_e pivot;
+
+        /*
+         * If we are the root node, our pivot axis is φ; otherwise, it's what-
+         * ever is next after our parent's pivot axis.
+         */
+        if (nid == 0) {
+            pivot = kd_tree_t::pivot_e::Phi;
+        } else {
+            pivot = next_pivot(tree[(nid - 1) / 2].internal.dim);
+        }
+
+        /*
+         * Try to create an internal node with the given pivot.
+         */
+        r = create_internal_node(
+            tree, spacepoints, &_indices[node.incomplete.begin],
+            &_index_buffer[node.incomplete.begin], nid, pivot);
+
+        /*
+         * Check whether the process succeeded. It should.
+         */
+        assert(r);
+    }
+}
+
+uint32_t round_to_power_2(uint32_t i) {
+    uint32_t power = 1u;
+
+    while (power < i) {
+        power *= 2u;
+    }
+
+    return power;
+}
+
+std::pair<vecmem::unique_alloc_ptr<kd_tree_t[]>, uint32_t> create_kd_tree(
+    vecmem::memory_resource& mr,
+    const internal_spacepoint<spacepoint>* const spacepoints, uint32_t n_sp) {
+    /*
+     * Allocate space for the indices. Since each spacepoint can produce at
+     * most two indices, we allocate enough space to support this theoretical
+     * upper bound.
+     */
+    vecmem::unique_alloc_ptr<uint32_t[]> indices =
+        vecmem::make_unique_alloc<uint32_t[]>(mr, 2 * n_sp);
+
+    /*
+     * Allocate an on-device counter for the number of halo points we have,
+     * which starts out at zero.
+     */
+    vecmem::unique_alloc_ptr<uint32_t> extra_indices =
+        vecmem::make_unique_alloc<uint32_t>(mr);
+
+    CUDA_ERROR_CHECK(cudaMemset(extra_indices.get(), 0, sizeof(uint32_t)));
+
+    /*
+     * Launch the index initialization kernel, which turns n spacepoints into
+     * anywhere between n and 2n indices in the index array.
+     */
+    std::size_t threads_per_block1 = 256;
+    initialize_index_vector<<<(n_sp / threads_per_block1) +
+                                  (n_sp % threads_per_block1 == 0 ? 0 : 1),
+                              threads_per_block1>>>(spacepoints, indices.get(),
+                                                    n_sp, extra_indices.get());
+
+    CUDA_ERROR_CHECK(cudaGetLastError());
+    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+
+    /*
+     * Retrieve the number of halo points from the device.
+     */
+    uint32_t extra_indices_h;
+
+    CUDA_ERROR_CHECK(cudaMemcpy(&extra_indices_h, extra_indices.get(),
+                                sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+    /*
+     * The total number of indices is equal to the number of spacepoints plus
+     * the number of halo points.
+     */
+    uint32_t num_indices = n_sp + extra_indices_h;
+
+    /*
+     * In theory, the number of nodes that is needed to support N points given
+     * K points per node is 2^⌈log_2⌈N / ⌊K / 2⌋⌉ + 1⌉ - 1. In practice, we
+     * build potentially _very_ unbalanced trees. Thus, we need a bit more
+     * space.
+     */
+    uint32_t largest_layer = round_to_power_2(num_indices) / 4;
+    uint32_t num_nodes = 4 * largest_layer - 1;
+
+    /*
+     * Allocate space in which the k-d tree will live.
+     */
+    vecmem::unique_alloc_ptr<kd_tree_t[]> kd_tree_device =
+        vecmem::make_unique_alloc<kd_tree_t[]>(mr, num_nodes);
+
+    /*
+     * Run the initialization kernel for the tree itself, which sets the
+     * initial types of the nodes.
+     */
+    std::size_t threads_per_block2 = 256;
+    initialize_kd_tree<<<(num_nodes / threads_per_block2) +
+                             (num_nodes % threads_per_block2 == 0 ? 0 : 1),
+                         threads_per_block2>>>(kd_tree_device.get(), num_nodes,
+                                               num_indices);
+
+    CUDA_ERROR_CHECK(cudaGetLastError());
+    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+
+    /*
+     * Allocate some temporary space for the index buffer, one half for "lower"
+     * points, and one for "upper" points, defined relative to some pivot
+     * point. Multiple nodes may use this buffer to do their partitions at the
+     * same time.
+     */
+    vecmem::unique_alloc_ptr<uint32_t[]> index_buffer =
+        vecmem::make_unique_alloc<uint32_t[]>(mr, num_indices);
+
+    /*
+     * Repeatedly apply the k-d tree construction kernel. The iteration width
+     * is the number of nodes that is processed in parallel, and is simply 2^i
+     * where i is the iteration. We can stop running kernels after we cover the
+     * entire last level of the tree.
+     */
+    for (uint32_t iteration = 0; (1u << iteration) <= largest_layer;
+         ++iteration) {
+        uint32_t width = 1u << iteration;
+        uint32_t items_per_node = largest_layer / width;
+        uint32_t threads_per_block = std::clamp(items_per_node, 32u, 256u);
+
+        construct_kd_tree_step<<<width, threads_per_block,
+                                 threads_per_block * sizeof(float)>>>(
+            kd_tree_device.get(), num_nodes, spacepoints, indices.get(),
+            index_buffer.get(), iteration);
+
+        CUDA_ERROR_CHECK(cudaGetLastError());
+        CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    }
+
+    return {std::move(kd_tree_device), num_nodes};
+}
+}  // namespace traccc::cuda
diff --git a/device/cuda/src/seeding2/kernels/seed_finding_kernel.cu b/device/cuda/src/seeding2/kernels/seed_finding_kernel.cu
new file mode 100644
index 0000000000..61a2ed0570
--- /dev/null
+++ b/device/cuda/src/seeding2/kernels/seed_finding_kernel.cu
@@ -0,0 +1,715 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <cooperative_groups.h>
+
+#include <traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp>
+#include <traccc/cuda/seeding2/types/internal_seed.hpp>
+#include <traccc/cuda/seeding2/types/kd_tree.hpp>
+#include <traccc/cuda/seeding2/types/range3d.hpp>
+#include <traccc/cuda/utils/definitions.hpp>
+#include <traccc/edm/internal_spacepoint.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <traccc/seeding/detail/lin_circle.hpp>
+#include <traccc/seeding/doublet_finding_helper.hpp>
+#include <traccc/seeding/triplet_finding_helper.hpp>
+
+#define THREADS_PER_BLOCK 32u
+#define MAX_LOWER_SP_PER_MIDDLE 200u
+#define MAX_UPPER_SP_PER_MIDDLE 200u
+#define KD_TREE_TRAVERSAL_STACK_SIZE 256u
+
+/**
+ * @brief Maximum difference in φ between two spacepoints adjacent in the same
+ * seed.
+ */
+static constexpr float MAX_DELTA_PHI = 0.06f;
+
+namespace traccc::cuda {
+/**
+ * @brief Determine whether two spacepoints are a valid pair.
+ *
+ * @param[in] finder_conf The seed finder configuration to se.
+ * @param[in] lo The lower spacepoint.
+ * @param[in] hi The higher spacepoint.
+ *
+ * @return True iff the pair is valid.
+ */
+__device__ bool is_valid_pair(const seedfinder_config finder_conf,
+                              const internal_spacepoint<spacepoint>& lo,
+                              const internal_spacepoint<spacepoint>& hi) {
+    float deltaR = hi.radius() - lo.radius();
+
+    if (deltaR > finder_conf.deltaRMax) {
+        return false;
+    }
+
+    if (deltaR < finder_conf.deltaRMin) {
+        return false;
+    }
+
+    float cot_theta = (hi.z() - lo.z()) / deltaR;
+
+    if (fabs(cot_theta) > finder_conf.cotThetaMax) {
+        return false;
+    }
+
+    float z_origin = lo.z() - lo.radius() * cot_theta;
+
+    if (z_origin < finder_conf.collisionRegionMin ||
+        z_origin > finder_conf.collisionRegionMax) {
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * @brief Retrieve values from a k-d tree in a given range and write them to
+ * an output array.
+ *
+ * Performs depth-first traversal of the k-d tree, looking for candidate
+ * spacepoints.
+ *
+ * @param[in] input Data packet for the seeding algorithm.
+ * @param[in] range 3D range in which to search.
+ * @param[in] mi_idx Index of the middle spacepoint.
+ * @param[in] upper True iff we're looking for upper spacepoints.
+ * @param[out] output_arr Output array to write candidates to.
+ * @param[out] output_cnt Total number of output candidates.
+ */
+__device__ void retrieve_from_tree(const seed_finding_data_t& input,
+                                   const range3d range, uint32_t mi_idx,
+                                   bool upper,
+                                   uint32_t* __restrict__ output_arr,
+                                   uint32_t* __restrict__ output_cnt) {
+    __shared__ uint32_t stack[KD_TREE_TRAVERSAL_STACK_SIZE];
+    __shared__ uint32_t stack_idx;
+
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+
+    __shared__ uint32_t leaves[256], leaf_num;
+
+    /*
+     * Retrieve the current spacepoint.
+     */
+    const internal_spacepoint<spacepoint> mi = input.spacepoints[mi_idx];
+
+    /*
+     * The leader thread initializes the DFS stack by pushing the root node to
+     * it.
+     */
+    if (block.thread_rank() == 0) {
+        leaf_num = 0;
+        stack_idx = 0;
+    }
+
+    block.sync();
+
+    uint32_t idx = block.size() - 1 + block.thread_rank();
+
+    assert(input.tree[idx].type == kd_tree_t::nodetype_e::INTERNAL);
+
+    if (input.tree[idx].range.intersects(range)) {
+        stack[atomicAdd(&stack_idx, 1u)] = idx;
+    }
+
+    block.sync();
+
+    /*
+     * The traversal runs until the stack is empty. Or until the entire kernel
+     * explodes and crashes!
+     */
+    while (stack_idx > 0) {
+        /*
+         * We perform this traversal in parallel, so multiple threads pop a
+         * value from the stack at the same time. Of course, this can only
+         * happen if the thread index fits within the current size of the
+         * stack.
+         */
+        if (block.thread_rank() < stack_idx) {
+            uint32_t current = stack[atomicSub(&stack_idx, 1u) - 1];
+
+            const kd_tree_t node = input.tree[current];
+
+            /*
+             * The node must be a leaf node or an internal node, otherwise we
+             * cannot (safely) traverse it.
+             */
+            assert(node.type == kd_tree_t::nodetype_e::LEAF ||
+                   node.type == kd_tree_t::nodetype_e::INTERNAL);
+
+            /*
+             * Choose the appropriate code path based on whether the node is a
+             * leaf or an internal node.
+             */
+            if (node.type == kd_tree_t::nodetype_e::LEAF) {
+                leaves[atomicAdd(&leaf_num, 1u)] = current;
+            } else if (node.type == kd_tree_t::nodetype_e::INTERNAL &&
+                       node.range.intersects(range)) {
+                /*
+                 * If we are an internal node and we intersect the search
+                 * range, we place both children on the stack. Only the leader
+                 * thread does this.
+                 */
+                uint32_t idx = atomicAdd(&stack_idx, 2u);
+                assert(idx + 1 < KD_TREE_TRAVERSAL_STACK_SIZE);
+
+                stack[idx] = 2 * current + 1;
+                stack[idx + 1] = 2 * current + 2;
+            }
+        }
+
+        /*
+         * Enforce synchronization of the block between DFS steps.
+         */
+        block.sync();
+    }
+
+    for (uint32_t i = block.thread_rank();
+         i < kd_tree_t::MAX_NODES_PER_LEAF * leaf_num; i += block.size()) {
+        uint32_t node_id = i / kd_tree_t::MAX_NODES_PER_LEAF;
+        uint32_t point_id = i % kd_tree_t::MAX_NODES_PER_LEAF;
+
+        const kd_tree_t& node = input.tree[leaves[node_id]];
+
+        /*
+         * For leaf nodes, the tile performs a parallel process in
+         * which candidate points are stored in the k-d tree.
+         */
+        const kd_tree_t::element_wrapper_t p = node.leaf.points[point_id];
+
+        if (range.contains(p.phi, p.r, p.z) && p.value != mi_idx) {
+            const internal_spacepoint<spacepoint> o =
+                input.spacepoints[p.value];
+
+            /*
+             * Ensure that the point is actually valid using cuts which
+             * are more specific than those encoded in the search
+             * range.
+             */
+            if (is_valid_pair(input.finder_conf, upper ? mi : o,
+                              upper ? o : mi)) {
+                /*
+                 * Reserve a spot in the output for this point, and
+                 * then store it if it would not exceed the output
+                 * array size.
+                 */
+                uint32_t idx = atomicAdd(output_cnt, 1u);
+
+                assert((upper && idx < MAX_UPPER_SP_PER_MIDDLE) ||
+                       (!upper && idx < MAX_LOWER_SP_PER_MIDDLE));
+
+                output_arr[idx] = p.value;
+            }
+        }
+    }
+
+    block.sync();
+}
+
+/**
+ * @brief Create an internal seed from three spacepoints.
+ *
+ * @param[in] input Data packet with seed finding data.
+ * @param[in] spacepoints The array of spacepoints.
+ * @param[in] lower The index of the lower spacepoint.
+ * @param[in] middle The index of the middle spacepoint.
+ * @param[in] upper The index of the upper spacepoint.
+ *
+ * @return An internal seed iff the triplet is valid, otherwise an invalid
+ * seed.
+ */
+__device__ internal_seed
+make_seed(const seed_finding_data_t& input,
+          const internal_spacepoint<spacepoint>* const spacepoints,
+          uint32_t lower, uint32_t middle, uint32_t upper) {
+    const internal_spacepoint<spacepoint> lo = spacepoints[lower];
+    const internal_spacepoint<spacepoint> mi = spacepoints[middle];
+    const internal_spacepoint<spacepoint> hi = spacepoints[upper];
+
+    /*
+     * Find the lin-circles for the bottom and top pair.
+     */
+    lin_circle lm = doublet_finding_helper::transform_coordinates(mi, lo, true);
+    lin_circle mh =
+        doublet_finding_helper::transform_coordinates(mi, hi, false);
+
+    scalar iSinTheta2 = 1 + lm.cotTheta() * lm.cotTheta();
+    scalar scatteringInRegion2 =
+        input.finder_conf.maxScatteringAngle2 * iSinTheta2;
+    scatteringInRegion2 *=
+        input.finder_conf.sigmaScattering * input.finder_conf.sigmaScattering;
+    scalar curvature, impact_parameter;
+
+    /*
+     * Borrow the compatibility code from the existing seed finding code.
+     */
+    if (triplet_finding_helper::isCompatible(mi, lm, mh, input.finder_conf,
+                                             iSinTheta2, scatteringInRegion2,
+                                             curvature, impact_parameter)) {
+        internal_seed r;
+
+        r.spacepoints[0] = lower;
+        r.spacepoints[1] = middle;
+        r.spacepoints[2] = upper;
+
+        /*
+         * Add a weight based on the impact parameter to the seed.
+         */
+        r.weight = -impact_parameter * input.filter_conf.impactWeightFactor;
+
+        return r;
+    } else {
+        /*
+         * If the triplet is invalid, return a bogus seed.
+         */
+        return internal_seed::Invalid();
+    }
+}
+
+/**
+ * @brief Get the basic Range3D object
+ *
+ * This contains only the most basic cuts, but is designed to be refined later.
+ *
+ * @param[in] finder_conf The seed finder configuration to use.
+ *
+ * @return A 3D range object.
+ */
+__device__ range3d get_basic_range3d(const seedfinder_config finder_conf) {
+    range3d r = range3d::Infinite();
+
+    r.r_min = std::max(r.r_min, 0.f);
+    r.r_max = std::min(r.r_max, finder_conf.rMax);
+
+    r.z_min = std::max(r.z_min, finder_conf.zMin);
+    r.z_max = std::min(r.z_max, finder_conf.zMax);
+
+    return r;
+}
+
+/**
+ * @brief Return the search range for a lower spacepoint given a middle
+ * spacepoint.
+ *
+ * @param[in] finder_conf The seed finder configuration to use.
+ * @param[in] s The middle spacepoint.
+ *
+ * @return A three-dimensional search range.
+ */
+__device__ range3d get_lower_range3d(const seedfinder_config finder_conf,
+                                     const internal_spacepoint<spacepoint>& s) {
+    range3d r = get_basic_range3d(finder_conf);
+
+    r.r_min = std::max(r.r_min, s.radius() - finder_conf.deltaRMax);
+    r.r_max = std::min(r.r_max, s.radius() - finder_conf.deltaRMin);
+
+    float frac_r = r.r_min / s.radius();
+
+    float z_min2 = (s.z() - finder_conf.collisionRegionMin) * frac_r +
+                   finder_conf.collisionRegionMin;
+    float z_max2 = (s.z() - finder_conf.collisionRegionMax) * frac_r +
+                   finder_conf.collisionRegionMax;
+
+    r.z_min = std::max(r.z_min, std::min(z_min2, s.z()));
+    r.z_max = std::min(r.z_max, std::max(z_max2, s.z()));
+
+    r.z_min = std::max(r.z_min, s.z() - 450.f);
+    r.z_max = std::min(r.z_max, s.z() + 450.f);
+
+    r.phi_min = std::max(r.phi_min, s.phi() - MAX_DELTA_PHI);
+    r.phi_max = std::min(r.phi_max, s.phi() + MAX_DELTA_PHI);
+
+    return r;
+}
+
+/**
+ * @brief Return the search range for an upper spacepoint given a middle
+ * spacepoint.
+ *
+ * @param[in] finder_conf The seed finder configuration to use.
+ * @param[in] s The middle spacepoint.
+ *
+ * @return A three-dimensional search range.
+ */
+__device__ range3d get_upper_range3d(const seedfinder_config finder_conf,
+                                     const internal_spacepoint<spacepoint>& s) {
+    range3d r = get_basic_range3d(finder_conf);
+
+    r.r_min = std::max(r.r_min, s.radius() + finder_conf.deltaRMin);
+    r.r_max = std::min(r.r_max, s.radius() + finder_conf.deltaRMax);
+
+    float z_max2 =
+        (r.r_max / s.radius()) * (s.z() - finder_conf.collisionRegionMin) +
+        finder_conf.collisionRegionMin;
+    float z_min2 =
+        finder_conf.collisionRegionMax -
+        (r.r_max / s.radius()) * (finder_conf.collisionRegionMax - s.z());
+
+    if (s.z() > finder_conf.collisionRegionMin) {
+        r.z_max = std::min(r.z_max, z_max2);
+    } else if (s.z() < finder_conf.collisionRegionMax) {
+        r.z_min = std::max(r.z_min, z_min2);
+    }
+
+    r.z_min = std::max(
+        r.z_min, s.z() - finder_conf.cotThetaMax * (r.r_max - s.radius()));
+    r.z_max = std::min(
+        r.z_max, s.z() + finder_conf.cotThetaMax * (r.r_max - s.radius()));
+
+    r.z_min = std::max(r.z_min, s.z() - 450.f);
+    r.z_max = std::min(r.z_max, s.z() + 450.f);
+
+    r.phi_min = std::max(r.phi_min, s.phi() - MAX_DELTA_PHI);
+    r.phi_max = std::min(r.phi_max, s.phi() + MAX_DELTA_PHI);
+
+    return r;
+}
+
+/**
+ * @brief Seed finding helper function.
+ *
+ * Turns out this function actually does most of the important work.
+ *
+ * @param[in] input Data parcel for seed finding configuration.
+ * @param[inout] internal_seeds The array to write internal seeds to.
+ * @param[in] increasing_z True iff we are looking for increasing-z seeds.
+ */
+__device__ void run_helper(const seed_finding_data_t& input,
+                           internal_seed* internal_seeds, bool increasing_z) {
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+
+    /*
+     * Allocate shared memory for the lower and upper candidates.
+     */
+    __shared__ uint32_t lower_sps[MAX_LOWER_SP_PER_MIDDLE];
+    __shared__ uint32_t upper_sps[MAX_UPPER_SP_PER_MIDDLE];
+
+    /*
+     * The leader thread initializes the candidate counts to zero.
+     */
+    __shared__ uint32_t num_lower, num_upper;
+
+    if (block.thread_rank() == 0) {
+        num_lower = 0;
+        num_upper = 0;
+    }
+
+    block.sync();
+
+    internal_spacepoint<spacepoint> sp =
+        input.spacepoints[block.group_index().x];
+
+    /*
+     * Retrieve the search range for the lower spacepoints, then adjust it for
+     * monotinicity in z.
+     */
+    range3d lower_range3d = get_lower_range3d(input.finder_conf, sp);
+
+    if (increasing_z) {
+        lower_range3d.z_max = std::min(lower_range3d.z_max, sp.z());
+    } else {
+        lower_range3d.z_min = std::max(lower_range3d.z_min, sp.z());
+    }
+
+    /*
+     * Same as above, but for the upper spacepoint. Note that some of the
+     * operations are reversed.
+     */
+    range3d upper_range3d = get_upper_range3d(input.finder_conf, sp);
+
+    if (increasing_z) {
+        upper_range3d.z_min = std::max(lower_range3d.z_min, sp.z());
+    } else {
+        upper_range3d.z_max = std::min(lower_range3d.z_max, sp.z());
+    }
+
+    /*
+     * If either of the search ranges is degenerate, we can never find any
+     * candidate seeds. So we can safely exit.
+     */
+    if (lower_range3d.phi_min >= lower_range3d.phi_max ||
+        lower_range3d.r_min >= lower_range3d.r_max ||
+        lower_range3d.z_min >= lower_range3d.z_max ||
+        upper_range3d.phi_min >= upper_range3d.phi_max ||
+        upper_range3d.r_min >= upper_range3d.r_max ||
+        upper_range3d.z_min >= upper_range3d.z_max) {
+        return;
+    }
+
+    /*
+     * Retrieve the lower and upper spacepoint candidates from the k-d tree.
+     */
+    retrieve_from_tree(input, lower_range3d, block.group_index().x, false,
+                       lower_sps, &num_lower);
+
+    block.sync();
+
+    /*
+     * If we have zero lower candidates, we might as well quit; we will never
+     * have any candidates!
+     */
+    if (num_lower == 0) {
+        return;
+    }
+
+    retrieve_from_tree(input, upper_range3d, block.group_index().x, true,
+                       upper_sps, &num_upper);
+
+    block.sync();
+
+    /*
+     * Life fast and die young! (It's an early quitting condition)
+     */
+    if (num_upper == 0) {
+        return;
+    }
+
+    /*
+     * Calculate the total number of iterations, but take into account the
+     * maximum size of the lower and upper spacepoint storage arrays.
+     */
+    uint32_t combinations = std::min(num_lower, MAX_LOWER_SP_PER_MIDDLE) *
+                            std::min(num_upper, MAX_UPPER_SP_PER_MIDDLE);
+
+    /*
+     * The iteration count is rounded up to a multiple of the block size to
+     * ensure that none of the threads slack off and exit prematurely.
+     */
+    uint32_t iter_count =
+        combinations + (combinations % block.size() != 0
+                            ? block.size() - (combinations % block.size())
+                            : 0);
+
+    for (uint32_t i = block.thread_rank(); i < iter_count; i += block.size()) {
+        uint32_t lower_id = i / num_upper;
+        uint32_t upper_id = i % num_upper;
+
+        /*
+         * Consider the current combination and turn it into a seed. This seed
+         * may be bogus! But that is okay, because the sorting step below will
+         * automatically separate bogus seeds.
+         */
+        if (lower_id < std::min(num_lower, MAX_LOWER_SP_PER_MIDDLE) &&
+            upper_id < std::min(num_upper, MAX_UPPER_SP_PER_MIDDLE)) {
+            internal_seeds[block.thread_rank()] =
+                make_seed(input, input.spacepoints, lower_sps[lower_id],
+                          block.group_index().x, upper_sps[upper_id]);
+        }
+
+        block.sync();
+
+        bool sorted;
+
+        /*
+         * Perform odd-even sorting, making sure that the seeds with the highest
+         * weight float to the top of the seed array, which is the part that
+         * will be written to the global output.
+         */
+        do {
+            sorted = true;
+
+            /*
+             * Odd step.
+             */
+            for (uint32_t j = 2 * block.thread_rank() + 1;
+                 j < block.size() + input.finder_conf.maxSeedsPerSpM - 1;
+                 j += 2 * block.size()) {
+                bool swap =
+                    internal_seeds[j].weight > internal_seeds[j + 1].weight;
+                internal_seed e = internal_seeds[j + 1];
+
+                if (swap) {
+                    internal_seeds[j + 1] = internal_seeds[j];
+                    internal_seeds[j] = e;
+                    sorted = false;
+                }
+            }
+
+            block.sync();
+
+            /*
+             * Even step.
+             */
+            for (uint32_t j = 2 * block.thread_rank();
+                 j < block.size() + input.finder_conf.maxSeedsPerSpM - 1;
+                 j += 2 * block.size()) {
+                bool swap =
+                    internal_seeds[j].weight > internal_seeds[j + 1].weight;
+                internal_seed e = internal_seeds[j + 1];
+
+                if (swap) {
+                    internal_seeds[j + 1] = internal_seeds[j];
+                    internal_seeds[j] = e;
+                    sorted = false;
+                }
+            }
+        } while (__syncthreads_or(!sorted));
+    }
+}
+
+/**
+ * @brief Main seed finding kernel.
+ *
+ * @note This kernel assumes exactly one warp per block and operates in a
+ * one-block-per-middle-SP fashion.
+ *
+ */
+__global__ void __launch_bounds__(32)
+    seed_finding_kernel(const seed_finding_data_t input,
+                        internal_seed* output_seeds,
+                        uint32_t* output_seed_size) {
+    cooperative_groups::thread_block block =
+        cooperative_groups::this_thread_block();
+
+    /*
+     * The seed finding configuration defines a few criteria that allow us to
+     * ignore some of the middle spacepoint candidates.
+     */
+    if (input.spacepoints[block.group_index().x].phi() <
+            input.finder_conf.phiMin ||
+        input.spacepoints[block.group_index().x].phi() >
+            input.finder_conf.phiMax ||
+        input.spacepoints[block.group_index().x].radius() <
+            input.finder_conf.rMin ||
+        input.spacepoints[block.group_index().x].radius() >
+            input.finder_conf.rMax ||
+        input.spacepoints[block.group_index().x].z() < input.finder_conf.zMin ||
+        input.spacepoints[block.group_index().x].z() > input.finder_conf.zMax) {
+        return;
+    }
+
+    /*
+     * The internal seeds are stored in this shared array.
+     */
+    extern __shared__ internal_seed internal_seeds[];
+
+    /*
+     * Initialize the internal seed array to contain only bogus seeds.
+     */
+    for (uint32_t i = block.thread_rank();
+         i < block.size() + input.finder_conf.maxSeedsPerSpM;
+         i += block.size()) {
+        internal_seeds[i].weight = std::numeric_limits<float>::lowest();
+    }
+
+    block.sync();
+
+    /*
+     * Run the actual seed finding twice; once with monotonically increasing
+     * z values, and once with monotonically decreasing ones.
+     */
+    run_helper(input, internal_seeds, true);
+    run_helper(input, internal_seeds, false);
+
+    /*
+     * We have now completed the seeding algorithm, and we finish by writing
+     * the seeds to our output array. This task is reserved for the leader
+     * thread only, and all the other threads are idle. Thankfully, this should
+     * not take much time at all.
+     */
+    if (block.thread_rank() == 0) {
+        /*
+         * First, we count the number of valid seeds that we will want to write
+         * to the global memory.
+         */
+        uint32_t num_valid = 0;
+
+        for (uint32_t i = 0; i < input.finder_conf.maxSeedsPerSpM; ++i) {
+            if (internal_seeds[block.size() + i].weight > -1000000.0f) {
+                ++num_valid;
+            }
+        }
+
+        /*
+         * Next, reserve space atomically in the output array for our new
+         * seeds.
+         */
+        uint32_t idx = atomicAdd(output_seed_size, num_valid);
+
+        /*
+         * Finally, actually write the output to global memory. Again, this is
+         * done sequentially by the leader thread, but since the number of
+         * seeds is so small it should be fine. Notice that we count down from
+         * the end of the array, because valid seeds are always sorted higher
+         * than invalid ones (if we have any).
+         */
+        for (uint32_t i = 0; i < num_valid; ++i) {
+            output_seeds[idx + i] =
+                internal_seeds[block.size() + input.finder_conf.maxSeedsPerSpM -
+                               (i + 1)];
+        }
+    }
+}
+
+/**
+ * @brief Seeding entry point for C++ code.
+ *
+ * This is basically a wrapper for the seed finding kernel.
+ *
+ * @param[in] finder_conf The seed finder configuration to use.
+ * @param[in] filter_conf The seed filter configuration to use.
+ * @param[inout] mr The memory resource for allocations.
+ * @param[in] spacepoints The spacepoint array.
+ * @param[in] n_sp The total number of spacepoints.
+ * @param[in] tree The k-d tree to use for searching.
+ * @param[in] tree_size The total size of the k-d tree.
+ *
+ * @return A unique pointer to an array of internal seeds, and the size of that
+ * array.
+ */
+std::pair<vecmem::unique_alloc_ptr<internal_seed[]>, uint32_t> run_seeding(
+    seedfinder_config finder_conf, seedfilter_config filter_conf,
+    vecmem::memory_resource& mr,
+    const internal_spacepoint<spacepoint>* const spacepoints, uint32_t n_sp,
+    vecmem::unique_alloc_ptr<kd_tree_t[]>&& tree, uint32_t tree_size) {
+    /*
+     * Allocate space for output of seeds on the device.
+     */
+    vecmem::unique_alloc_ptr<internal_seed[]> seeds_device =
+        vecmem::make_unique_alloc<internal_seed[]>(
+            mr, finder_conf.maxSeedsPerSpM * n_sp);
+
+    /*
+     * Allocate space for seed count on the device.
+     */
+    vecmem::unique_alloc_ptr<uint32_t> seed_count_device =
+        vecmem::make_unique_alloc<uint32_t>(mr);
+    CUDA_ERROR_CHECK(cudaMemset(seed_count_device.get(), 0, sizeof(uint32_t)));
+
+    /*
+     * Calculate the total amount of shared memory on top of that which is
+     * statically defined in the CUDA kernels. This is equal to one seed for
+     * each block plus one seed for each output seed.
+     */
+    std::size_t total_shared_memory =
+        (THREADS_PER_BLOCK + finder_conf.maxSeedsPerSpM) *
+        sizeof(internal_seed);
+
+    /*
+     * Call the kernel and all that jazz.
+     */
+    seed_finding_kernel<<<n_sp, THREADS_PER_BLOCK, total_shared_memory>>>(
+        {finder_conf, filter_conf, spacepoints, tree.get(), tree_size},
+        seeds_device.get(), seed_count_device.get());
+
+    CUDA_ERROR_CHECK(cudaGetLastError());
+    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+
+    /*
+     * Transfer the seed count back to the host and then hand it to the user.
+     */
+    uint32_t seed_count_host;
+    CUDA_ERROR_CHECK(cudaMemcpy(&seed_count_host, seed_count_device.get(),
+                                sizeof(uint32_t), cudaMemcpyDeviceToHost));
+
+    return {std::move(seeds_device), seed_count_host};
+}
+}  // namespace traccc::cuda
diff --git a/device/cuda/src/seeding2/kernels/write_output_kernel.cu b/device/cuda/src/seeding2/kernels/write_output_kernel.cu
new file mode 100644
index 0000000000..29a159abd2
--- /dev/null
+++ b/device/cuda/src/seeding2/kernels/write_output_kernel.cu
@@ -0,0 +1,108 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <cooperative_groups.h>
+
+#include <cstdint>
+#include <traccc/cuda/seeding2/kernels/write_output_kernel.hpp>
+#include <traccc/cuda/seeding2/types/internal_seed.hpp>
+#include <traccc/cuda/utils/definitions.hpp>
+#include <traccc/edm/internal_spacepoint.hpp>
+#include <traccc/edm/seed.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <traccc/utils/memory_resource.hpp>
+#include <vecmem/containers/device_vector.hpp>
+#include <vecmem/containers/vector.hpp>
+#include <vecmem/utils/cuda/copy.hpp>
+
+namespace traccc::cuda {
+/**
+ * @brief Main output writing kernel.
+ *
+ * @param[in] n_seeds The number of seeds to write.
+ * @param[in] spacepoints Spacepoint array into which seeds are indexed.
+ * @param[in] in_seeds Input internal seeds to read from.
+ * @param[out] out_seeds Output seeds to write to.
+ */
+__global__ void write_output_kernel(
+    uint32_t n_seeds, const internal_spacepoint<spacepoint>* const spacepoints,
+    const internal_seed* const in_seeds,
+    vecmem::data::vector_view<seed> out_seeds) {
+    cooperative_groups::grid_group grid = cooperative_groups::this_grid();
+
+    /*
+     * Create a writable device vector for the seeds.
+     */
+    vecmem::device_vector<seed> dvec(out_seeds);
+
+    int i = grid.thread_rank();
+
+    if (i < n_seeds) {
+        /*
+         * Get the output seed for the current thread.
+         */
+        seed& s = dvec.at(i);
+
+        /*
+         * Write the data back to the output.
+         */
+        s.spB_link.first = spacepoints[in_seeds[i].spacepoints[0]].m_link.first;
+        s.spB_link.second =
+            spacepoints[in_seeds[i].spacepoints[0]].m_link.second;
+        s.spM_link.first = spacepoints[in_seeds[i].spacepoints[1]].m_link.first;
+        s.spM_link.second =
+            spacepoints[in_seeds[i].spacepoints[1]].m_link.second;
+        s.spT_link.first = spacepoints[in_seeds[i].spacepoints[2]].m_link.first;
+        s.spT_link.second =
+            spacepoints[in_seeds[i].spacepoints[2]].m_link.second;
+
+        s.weight = in_seeds[i].weight;
+        s.z_vertex = 0.f;
+    }
+}
+
+vecmem::data::vector_buffer<seed> write_output(
+    const traccc::memory_resource& mr, uint32_t n_seeds,
+    const internal_spacepoint<spacepoint>* const spacepoints,
+    const internal_seed* const seeds) {
+    static constexpr std::size_t threads_per_block = 256;
+
+    /*
+     * Create a "vector buffer" that we can write our output to.
+     */
+    vecmem::data::vector_buffer<seed> out(n_seeds, mr.main);
+
+    /*
+     * Fetch the current CUDA device number.
+     */
+    int device;
+    CUDA_ERROR_CHECK(cudaGetDevice(&device));
+
+    /*
+     * Instruct CUDA to prefetch the managed memory of the output to the
+     * device, to minimize the number of page faults.
+     *
+     * TODO: This needs a check to ensure that the memory is actually managed
+     * memory.
+     */
+    CUDA_ERROR_CHECK(
+        cudaMemPrefetchAsync(out.ptr(), n_seeds * sizeof(seed), device));
+
+    /*
+     * Execute the main output kernel.
+     */
+    write_output_kernel<<<(n_seeds / threads_per_block) +
+                              (n_seeds % threads_per_block == 0 ? 0 : 1),
+                          threads_per_block>>>(n_seeds, spacepoints, seeds,
+                                               out);
+
+    CUDA_ERROR_CHECK(cudaGetLastError());
+    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+
+    return out;
+}
+}  // namespace traccc::cuda
diff --git a/device/cuda/src/seeding2/seed_finding.cu b/device/cuda/src/seeding2/seed_finding.cu
new file mode 100644
index 0000000000..5b94864530
--- /dev/null
+++ b/device/cuda/src/seeding2/seed_finding.cu
@@ -0,0 +1,108 @@
+/** TRACCC library, part of the ACTS project (R&D line)
+ *
+ * (c) 2022 CERN for the benefit of the ACTS project
+ *
+ * Mozilla Public License Version 2.0
+ */
+
+#include <cooperative_groups.h>
+
+#include <limits>
+#include <memory>
+#include <traccc/cuda/seeding2/kernels/kd_tree_kernel.hpp>
+#include <traccc/cuda/seeding2/kernels/seed_finding_kernel.hpp>
+#include <traccc/cuda/seeding2/kernels/write_output_kernel.hpp>
+#include <traccc/cuda/seeding2/seed_finding.hpp>
+#include <traccc/cuda/seeding2/types/internal_seed.hpp>
+#include <traccc/cuda/seeding2/types/kd_tree.hpp>
+#include <traccc/cuda/utils/definitions.hpp>
+#include <traccc/edm/seed.hpp>
+#include <traccc/edm/spacepoint.hpp>
+#include <traccc/seeding/detail/lin_circle.hpp>
+#include <traccc/seeding/detail/seeding_config.hpp>
+#include <vecmem/memory/memory_resource.hpp>
+#include <vecmem/memory/unique_ptr.hpp>
+#include <vector>
+
+namespace traccc::cuda {
+std::tuple<vecmem::unique_alloc_ptr<internal_spacepoint<spacepoint>[]>,
+           uint32_t>
+copy_sps_to_device(vecmem::memory_resource& mr,
+                   const spacepoint_container_types::const_view& sps) {
+    std::chrono::high_resolution_clock::time_point time_ex_start =
+        std::chrono::high_resolution_clock::now();
+
+    const traccc::spacepoint_container_types::const_device p(sps);
+
+    std::vector<internal_spacepoint<spacepoint>> spacepoints;
+
+    for (std::size_t i = 0; i < p.get_items().size(); ++i) {
+        for (std::size_t j = 0; j < p.get_items().at(i).size(); ++j) {
+            const traccc::spacepoint& sp = p.get_items().at(i).at(j);
+
+            spacepoints.push_back(
+                internal_spacepoint<spacepoint>(p, {i, j}, {0.f, 0.f}));
+        }
+    }
+
+    uint32_t n_sp = spacepoints.size();
+
+    // Move the spacepoints to device memory.
+    vecmem::unique_alloc_ptr<internal_spacepoint<spacepoint>[]>
+        spacepoints_device =
+            vecmem::make_unique_alloc<internal_spacepoint<spacepoint>[]>(
+                mr, n_sp, spacepoints.data(),
+                [](internal_spacepoint<spacepoint>* dst,
+                   const internal_spacepoint<spacepoint>* src,
+                   std::size_t bytes) {
+                    CUDA_ERROR_CHECK(
+                        cudaMemcpy(dst, src, bytes, cudaMemcpyHostToDevice));
+                });
+
+    return {std::move(spacepoints_device), n_sp};
+}
+
+seed_finding2::seed_finding2(const traccc::memory_resource& mr)
+    : m_output_mr(mr), m_finder_conf() {
+    m_finder_conf.highland =
+        13.6 * std::sqrt(m_finder_conf.radLengthPerSeed) *
+        (1 + 0.038 * std::log(m_finder_conf.radLengthPerSeed));
+    float maxScatteringAngle = m_finder_conf.highland / m_finder_conf.minPt;
+    m_finder_conf.maxScatteringAngle2 = maxScatteringAngle * maxScatteringAngle;
+
+    // helix radius in homogeneous magnetic field. Units are Kilotesla, MeV
+    // and millimeter
+    // TODO: change using ACTS units
+    m_finder_conf.pTPerHelixRadius = 300. * m_finder_conf.bFieldInZ;
+    m_finder_conf.minHelixDiameter2 =
+        std::pow(m_finder_conf.minPt * 2 / m_finder_conf.pTPerHelixRadius, 2);
+    m_finder_conf.pT2perRadius =
+        std::pow(m_finder_conf.highland / m_finder_conf.pTPerHelixRadius, 2);
+}
+
+vecmem::data::vector_buffer<seed> seed_finding2::operator()(
+    const spacepoint_container_types::const_view& sps) const {
+    vecmem::cuda::device_memory_resource mr;
+    vecmem::unique_alloc_ptr<internal_spacepoint<spacepoint>[]>
+        spacepoints_device;
+    uint32_t n_sp;
+
+    std::tie(spacepoints_device, n_sp) = copy_sps_to_device(mr, sps);
+
+    vecmem::unique_alloc_ptr<kd_tree_t[]> kd_tree_device;
+    uint32_t kd_tree_size;
+
+    std::tie(kd_tree_device, kd_tree_size) =
+        create_kd_tree(mr, spacepoints_device.get(), n_sp);
+
+    vecmem::unique_alloc_ptr<internal_seed[]> seeds_device;
+    uint32_t seed_count;
+
+    std::tie(seeds_device, seed_count) =
+        run_seeding(m_finder_conf, m_filter_conf, mr, spacepoints_device.get(),
+                    n_sp, std::move(kd_tree_device), kd_tree_size);
+
+    return write_output(m_output_mr, seed_count, spacepoints_device.get(),
+                        seeds_device.get());
+}
+}  // namespace traccc::cuda