From 7a2874163ba14a70374550549555e47a914d7110 Mon Sep 17 00:00:00 2001
From: Guilherme <g.almeida@cern.ch>
Date: Mon, 13 Mar 2023 10:57:52 +0100
Subject: [PATCH] Made cuda seeding and track parameters estimation use
 asynchronous copies and kernel launches

---
 .../traccc/cuda/seeding/seed_finding.hpp      |  17 ++-
 .../traccc/cuda/seeding/seeding_algorithm.hpp |  12 +-
 .../cuda/seeding/spacepoint_binning.hpp       |  12 +-
 .../cuda/seeding/track_params_estimation.hpp  |  15 ++-
 device/cuda/src/seeding/seed_finding.cu       | 123 ++++++++++--------
 device/cuda/src/seeding/seeding_algorithm.cpp |  10 +-
 device/cuda/src/seeding/spacepoint_binning.cu |  38 +++---
 .../src/seeding/track_params_estimation.cu    |  30 ++---
 examples/run/cuda/full_chain_algorithm.cpp    |  12 +-
 examples/run/cuda/seeding_example_cuda.cpp    |  10 +-
 examples/run/cuda/seq_example_cuda.cpp        |   6 +-
 11 files changed, 164 insertions(+), 121 deletions(-)

diff --git a/device/cuda/include/traccc/cuda/seeding/seed_finding.hpp b/device/cuda/include/traccc/cuda/seeding/seed_finding.hpp
index 5f2d050052..d4abd19036 100644
--- a/device/cuda/include/traccc/cuda/seeding/seed_finding.hpp
+++ b/device/cuda/include/traccc/cuda/seeding/seed_finding.hpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -8,6 +8,7 @@
 #pragma once
 
 // Project include(s).
+#include "traccc/cuda/utils/stream.hpp"
 #include "traccc/edm/alt_seed.hpp"
 #include "traccc/edm/spacepoint.hpp"
 #include "traccc/seeding/detail/seeding_config.hpp"
@@ -34,10 +35,14 @@ class seed_finding : public algorithm<alt_seed_collection_types::buffer(
     /// @param config is seed finder configuration parameters
     /// @param filter_config is seed filter configuration parameters
     /// @param sp_grid spacepoint grid
-    /// @param mr vecmem memory resource
+    /// @param mr The memory resource(s) to use in the algorithm
+    /// @param copy The copy object to use for copying data between device
+    ///             and host memory blocks
+    /// @param str The CUDA stream to perform the operations in
     seed_finding(const seedfinder_config& config,
                  const seedfilter_config& filter_config,
-                 const traccc::memory_resource& mr);
+                 const traccc::memory_resource& mr, vecmem::copy& copy,
+                 stream& str);
 
     /// Callable operator for the seed finding
     ///
@@ -53,7 +58,11 @@ class seed_finding : public algorithm<alt_seed_collection_types::buffer(
     seedfinder_config m_seedfinder_config;
     seedfilter_config m_seedfilter_config;
     traccc::memory_resource m_mr;
-    std::unique_ptr<vecmem::copy> m_copy;
+
+    /// The copy object to use
+    vecmem::copy& m_copy;
+    /// The CUDA stream to use
+    stream& m_stream;
 };
 
 }  // namespace traccc::cuda
diff --git a/device/cuda/include/traccc/cuda/seeding/seeding_algorithm.hpp b/device/cuda/include/traccc/cuda/seeding/seeding_algorithm.hpp
index 38407e2c52..d78af1bcab 100644
--- a/device/cuda/include/traccc/cuda/seeding/seeding_algorithm.hpp
+++ b/device/cuda/include/traccc/cuda/seeding/seeding_algorithm.hpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -10,6 +10,7 @@
 // Library include(s).
 #include "traccc/cuda/seeding/seed_finding.hpp"
 #include "traccc/cuda/seeding/spacepoint_binning.hpp"
+#include "traccc/cuda/utils/stream.hpp"
 
 // Project include(s).
 #include "traccc/edm/alt_seed.hpp"
@@ -17,7 +18,7 @@
 #include "traccc/utils/algorithm.hpp"
 
 // VecMem include(s).
-#include <vecmem/memory/memory_resource.hpp>
+#include <vecmem/utils/copy.hpp>
 
 // traccc library include(s).
 #include "traccc/utils/memory_resource.hpp"
@@ -32,8 +33,13 @@ class seeding_algorithm : public algorithm<alt_seed_collection_types::buffer(
     /// Constructor for the seed finding algorithm
     ///
     /// @param mr The memory resource to use
+    /// @param mr The memory resource(s) to use in the algorithm
+    /// @param copy The copy object to use for copying data between device
+    ///             and host memory blocks
+    /// @param str The CUDA stream to perform the operations in
     ///
-    seeding_algorithm(const traccc::memory_resource& mr);
+    seeding_algorithm(const traccc::memory_resource& mr, vecmem::copy& copy,
+                      stream& str);
 
     /// Operator executing the algorithm.
     ///
diff --git a/device/cuda/include/traccc/cuda/seeding/spacepoint_binning.hpp b/device/cuda/include/traccc/cuda/seeding/spacepoint_binning.hpp
index 40797197ae..02d89af21b 100644
--- a/device/cuda/include/traccc/cuda/seeding/spacepoint_binning.hpp
+++ b/device/cuda/include/traccc/cuda/seeding/spacepoint_binning.hpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -8,6 +8,7 @@
 #pragma once
 
 // Project include(s).
+#include "traccc/cuda/utils/stream.hpp"
 #include "traccc/edm/spacepoint.hpp"
 #include "traccc/seeding/detail/seeding_config.hpp"
 #include "traccc/seeding/detail/spacepoint_grid.hpp"
@@ -32,7 +33,8 @@ class spacepoint_binning
     /// Constructor for the algorithm
     spacepoint_binning(const seedfinder_config& config,
                        const spacepoint_grid_config& grid_config,
-                       const traccc::memory_resource& mr);
+                       const traccc::memory_resource& mr, vecmem::copy& copy,
+                       stream& str);
 
     /// Function executing the algorithm with a a view of spacepoints
     sp_grid_buffer operator()(const spacepoint_collection_types::const_view&
@@ -43,7 +45,11 @@ class spacepoint_binning
     seedfinder_config m_config;
     std::pair<sp_grid::axis_p0_type, sp_grid::axis_p1_type> m_axes;
     traccc::memory_resource m_mr;
-    std::unique_ptr<vecmem::copy> m_copy;
+
+    /// The copy object to use
+    vecmem::copy& m_copy;
+    /// The CUDA stream to use
+    stream& m_stream;
 
 };  // class spacepoint_binning
 
diff --git a/device/cuda/include/traccc/cuda/seeding/track_params_estimation.hpp b/device/cuda/include/traccc/cuda/seeding/track_params_estimation.hpp
index 8549387776..49d3f40df6 100644
--- a/device/cuda/include/traccc/cuda/seeding/track_params_estimation.hpp
+++ b/device/cuda/include/traccc/cuda/seeding/track_params_estimation.hpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -8,6 +8,7 @@
 #pragma once
 
 // Project include(s)
+#include "traccc/cuda/utils/stream.hpp"
 #include "traccc/edm/alt_seed.hpp"
 #include "traccc/edm/spacepoint.hpp"
 #include "traccc/edm/track_parameters.hpp"
@@ -30,7 +31,11 @@ struct track_params_estimation
     /// Constructor for track_params_estimation
     ///
     /// @param mr is the memory resource
-    track_params_estimation(const traccc::memory_resource& mr);
+    /// @param copy The copy object to use for copying data between device
+    ///             and host memory blocks
+    /// @param str The CUDA stream to perform the operations in
+    track_params_estimation(const traccc::memory_resource& mr,
+                            vecmem::copy& copy, stream& str);
 
     /// Callable operator for track_params_esitmation
     ///
@@ -45,8 +50,10 @@ struct track_params_estimation
     private:
     /// Memory resource used by the algorithm
     traccc::memory_resource m_mr;
-    /// Copy object used by the algorithm
-    std::unique_ptr<vecmem::copy> m_copy;
+    /// The copy object to use
+    vecmem::copy& m_copy;
+    /// The CUDA stream to use
+    stream& m_stream;
 };
 
 }  // namespace cuda
diff --git a/device/cuda/src/seeding/seed_finding.cu b/device/cuda/src/seeding/seed_finding.cu
index ce1b78b3a4..e7a7e56aab 100644
--- a/device/cuda/src/seeding/seed_finding.cu
+++ b/device/cuda/src/seeding/seed_finding.cu
@@ -6,6 +6,7 @@
  */
 
 // Local include(s).
+#include "../utils/utils.hpp"
 #include "traccc/cuda/seeding/seed_finding.hpp"
 #include "traccc/cuda/utils/definitions.hpp"
 
@@ -141,40 +142,38 @@ __global__ void select_seeds(
 
 seed_finding::seed_finding(const seedfinder_config& config,
                            const seedfilter_config& filter_config,
-                           const traccc::memory_resource& mr)
+                           const traccc::memory_resource& mr,
+                           vecmem::copy& copy, stream& str)
     : m_seedfinder_config(config.toInternalUnits()),
       m_seedfilter_config(filter_config.toInternalUnits()),
-      m_mr(mr) {
-
-    // Initialize m_copy ptr based on memory resources that were given
-    if (mr.host) {
-        m_copy = std::make_unique<vecmem::cuda::copy>();
-    } else {
-        m_copy = std::make_unique<vecmem::copy>();
-    }
-}
+      m_mr(mr),
+      m_copy(copy),
+      m_stream(str) {}
 
 seed_finding::output_type seed_finding::operator()(
     const spacepoint_collection_types::const_view& spacepoints_view,
     const sp_grid_const_view& g2_view) const {
 
+    // Get a convenience variable for the stream that we'll be using.
+    cudaStream_t stream = details::get_stream(m_stream);
+
     // Get the sizes from the grid view
-    auto grid_sizes = m_copy->get_sizes(g2_view._data_view);
+    auto grid_sizes = m_copy.get_sizes(g2_view._data_view);
 
     // Create prefix sum buffer
     vecmem::data::vector_buffer sp_grid_prefix_sum_buff =
-        make_prefix_sum_buff(grid_sizes, *m_copy, m_mr);
+        make_prefix_sum_buff(grid_sizes, m_copy, m_mr, m_stream);
 
     // Set up the doublet counter buffer.
     device::doublet_counter_collection_types::buffer doublet_counter_buffer = {
-        m_copy->get_size(sp_grid_prefix_sum_buff), 0, m_mr.main};
-    m_copy->setup(doublet_counter_buffer);
+        m_copy.get_size(sp_grid_prefix_sum_buff), 0, m_mr.main};
+    m_copy.setup(doublet_counter_buffer);
 
     // Calculate the number of threads and thread blocks to run the doublet
     // counting kernel for.
     const unsigned int nDoubletCountThreads = WARP_SIZE * 2;
     const unsigned int nDoubletCountBlocks =
-        (m_copy->get_size(sp_grid_prefix_sum_buff) + nDoubletCountThreads - 1) /
+        (m_copy.get_size(sp_grid_prefix_sum_buff) + nDoubletCountThreads - 1) /
         nDoubletCountThreads;
 
     // Counter for the total number of doublets and triplets
@@ -182,54 +181,59 @@ seed_finding::output_type seed_finding::operator()(
         globalCounter_device =
             vecmem::make_unique_alloc<device::seeding_global_counter>(
                 m_mr.main);
-    CUDA_ERROR_CHECK(cudaMemset(globalCounter_device.get(), 0,
-                                sizeof(device::seeding_global_counter)));
+    CUDA_ERROR_CHECK(cudaMemsetAsync(globalCounter_device.get(), 0,
+                                     sizeof(device::seeding_global_counter),
+                                     stream));
 
     // Count the number of doublets that we need to produce.
-    kernels::count_doublets<<<nDoubletCountBlocks, nDoubletCountThreads>>>(
+    kernels::count_doublets<<<nDoubletCountBlocks, nDoubletCountThreads, 0,
+                              stream>>>(
         m_seedfinder_config, g2_view, sp_grid_prefix_sum_buff,
         doublet_counter_buffer, (*globalCounter_device).m_nMidBot,
         (*globalCounter_device).m_nMidTop);
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     // Get the summary values.
     device::seeding_global_counter globalCounter_host;
-    CUDA_ERROR_CHECK(cudaMemcpy(&globalCounter_host, globalCounter_device.get(),
-                                sizeof(device::seeding_global_counter),
-                                cudaMemcpyDeviceToHost));
+    CUDA_ERROR_CHECK(cudaMemcpyAsync(&globalCounter_host,
+                                     globalCounter_device.get(),
+                                     sizeof(device::seeding_global_counter),
+                                     cudaMemcpyDeviceToHost, stream));
+    m_stream.synchronize();
 
     // Set up the doublet counter buffers.
     device::device_doublet_collection_types::buffer doublet_buffer_mb = {
         globalCounter_host.m_nMidBot, m_mr.main};
-    m_copy->setup(doublet_buffer_mb);
+    m_copy.setup(doublet_buffer_mb);
     device::device_doublet_collection_types::buffer doublet_buffer_mt = {
         globalCounter_host.m_nMidTop, m_mr.main};
-    m_copy->setup(doublet_buffer_mt);
+    m_copy.setup(doublet_buffer_mt);
 
     // Calculate the number of threads and thread blocks to run the doublet
     // finding kernel for.
     const unsigned int nDoubletFindThreads = WARP_SIZE * 2;
     const unsigned int doublet_counter_buffer_size =
-        m_copy->get_size(doublet_counter_buffer);
+        m_copy.get_size(doublet_counter_buffer);
     const unsigned int nDoubletFindBlocks =
         (doublet_counter_buffer_size + nDoubletFindThreads - 1) /
         nDoubletFindThreads;
 
     // Find all of the spacepoint doublets.
-    kernels::find_doublets<<<nDoubletFindBlocks, nDoubletFindThreads>>>(
-        m_seedfinder_config, g2_view, doublet_counter_buffer, doublet_buffer_mb,
-        doublet_buffer_mt);
+    kernels::
+        find_doublets<<<nDoubletFindBlocks, nDoubletFindThreads, 0, stream>>>(
+            m_seedfinder_config, g2_view, doublet_counter_buffer,
+            doublet_buffer_mb, doublet_buffer_mt);
 
     // Set up the triplet counter buffers
     device::triplet_counter_spM_collection_types::buffer
         triplet_counter_spM_buffer = {doublet_counter_buffer_size, m_mr.main};
-    m_copy->setup(triplet_counter_spM_buffer);
-    m_copy->memset(triplet_counter_spM_buffer, 0);
+    m_copy.setup(triplet_counter_spM_buffer);
+    m_copy.memset(triplet_counter_spM_buffer, 0);
     device::triplet_counter_collection_types::buffer
         triplet_counter_midBot_buffer = {globalCounter_host.m_nMidBot, 0,
                                          m_mr.main};
-    m_copy->setup(triplet_counter_midBot_buffer);
+    m_copy.setup(triplet_counter_midBot_buffer);
 
     // Calculate the number of threads and thread blocks to run the doublet
     // counting kernel for.
@@ -240,10 +244,11 @@ seed_finding::output_type seed_finding::operator()(
 
     // Wait here for the find doublets kernel to finish
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     // Count the number of triplets that we need to produce.
-    kernels::count_triplets<<<nTripletCountBlocks, nTripletCountThreads>>>(
+    kernels::count_triplets<<<nTripletCountBlocks, nTripletCountThreads, 0,
+                              stream>>>(
         m_seedfinder_config, g2_view, doublet_counter_buffer, doublet_buffer_mb,
         doublet_buffer_mt, triplet_counter_spM_buffer,
         triplet_counter_midBot_buffer);
@@ -257,37 +262,42 @@ seed_finding::output_type seed_finding::operator()(
 
     // Wait here for the count triplets kernel to finish
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     // Reduce the triplet counts per spM.
-    kernels::reduce_triplet_counts<<<nTcReductionBlocks, nTcReductionThreads>>>(
+    kernels::reduce_triplet_counts<<<nTcReductionBlocks, nTcReductionThreads, 0,
+                                     stream>>>(
         doublet_counter_buffer, triplet_counter_spM_buffer,
         (*globalCounter_device).m_nTriplets);
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
-    CUDA_ERROR_CHECK(cudaMemcpy(&globalCounter_host, globalCounter_device.get(),
-                                sizeof(device::seeding_global_counter),
-                                cudaMemcpyDeviceToHost));
+    CUDA_ERROR_CHECK(cudaMemcpyAsync(&globalCounter_host,
+                                     globalCounter_device.get(),
+                                     sizeof(device::seeding_global_counter),
+                                     cudaMemcpyDeviceToHost, stream));
+    m_stream.synchronize();
 
     // Set up the triplet buffer.
     device::device_triplet_collection_types::buffer triplet_buffer = {
         globalCounter_host.m_nTriplets, m_mr.main};
-    m_copy->setup(triplet_buffer);
+    m_copy.setup(triplet_buffer);
 
     // Calculate the number of threads and thread blocks to run the triplet
     // finding kernel for.
     const unsigned int nTripletFindThreads = WARP_SIZE * 2;
     const unsigned int nTripletFindBlocks =
-        (m_copy->get_size(triplet_counter_midBot_buffer) + nTripletFindThreads -
+        (m_copy.get_size(triplet_counter_midBot_buffer) + nTripletFindThreads -
          1) /
         nTripletFindThreads;
 
     // Find all of the spacepoint triplets.
-    kernels::find_triplets<<<nTripletFindBlocks, nTripletFindThreads>>>(
-        m_seedfinder_config, m_seedfilter_config, g2_view,
-        doublet_counter_buffer, doublet_buffer_mt, triplet_counter_spM_buffer,
-        triplet_counter_midBot_buffer, triplet_buffer);
+    kernels::
+        find_triplets<<<nTripletFindBlocks, nTripletFindThreads, 0, stream>>>(
+            m_seedfinder_config, m_seedfilter_config, g2_view,
+            doublet_counter_buffer, doublet_buffer_mt,
+            triplet_counter_spM_buffer, triplet_counter_midBot_buffer,
+            triplet_buffer);
 
     // Calculate the number of threads and thread blocks to run the weight
     // updating kernel for.
@@ -298,20 +308,20 @@ seed_finding::output_type seed_finding::operator()(
 
     // Wait here for the find triplets kernel to finish
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     // Update the weights of all spacepoint triplets.
     kernels::update_triplet_weights<<<
         nWeightUpdatingBlocks, nWeightUpdatingThreads,
         sizeof(scalar) * m_seedfilter_config.compatSeedLimit *
-            nWeightUpdatingThreads>>>(
-        m_seedfilter_config, g2_view, triplet_counter_spM_buffer,
-        triplet_counter_midBot_buffer, triplet_buffer);
+            nWeightUpdatingThreads,
+        stream>>>(m_seedfilter_config, g2_view, triplet_counter_spM_buffer,
+                  triplet_counter_midBot_buffer, triplet_buffer);
 
     // Create result object: collection of seeds
     alt_seed_collection_types::buffer seed_buffer(
         globalCounter_host.m_nTriplets, 0, m_mr.main);
-    m_copy->setup(seed_buffer);
+    m_copy.setup(seed_buffer);
 
     // Calculate the number of threads and thread blocks to run the seed
     // selecting kernel for.
@@ -322,18 +332,19 @@ seed_finding::output_type seed_finding::operator()(
 
     // Wait here for the update triplet weights kernel to finish
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     // Create seeds out of selected triplets
     kernels::select_seeds<<<nSeedSelectingBlocks, nSeedSelectingThreads,
                             sizeof(triplet) *
                                 m_seedfilter_config.max_triplets_per_spM *
-                                nSeedSelectingThreads>>>(
-        m_seedfilter_config, spacepoints_view, g2_view,
-        triplet_counter_spM_buffer, triplet_counter_midBot_buffer,
-        triplet_buffer, seed_buffer);
+                                nSeedSelectingThreads,
+                            stream>>>(m_seedfilter_config, spacepoints_view,
+                                      g2_view, triplet_counter_spM_buffer,
+                                      triplet_counter_midBot_buffer,
+                                      triplet_buffer, seed_buffer);
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     return seed_buffer;
 }
diff --git a/device/cuda/src/seeding/seeding_algorithm.cpp b/device/cuda/src/seeding/seeding_algorithm.cpp
index 4828680878..879aaec65c 100644
--- a/device/cuda/src/seeding/seeding_algorithm.cpp
+++ b/device/cuda/src/seeding/seeding_algorithm.cpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -58,10 +58,12 @@ traccc::spacepoint_grid_config default_spacepoint_grid_config() {
 
 namespace traccc::cuda {
 
-seeding_algorithm::seeding_algorithm(const traccc::memory_resource& mr)
+seeding_algorithm::seeding_algorithm(const traccc::memory_resource& mr,
+                                     vecmem::copy& copy, stream& str)
     : m_spacepoint_binning(default_seedfinder_config(),
-                           default_spacepoint_grid_config(), mr),
-      m_seed_finding(default_seedfinder_config(), seedfilter_config(), mr) {}
+                           default_spacepoint_grid_config(), mr, copy, str),
+      m_seed_finding(default_seedfinder_config(), seedfilter_config(), mr, copy,
+                     str) {}
 
 seeding_algorithm::output_type seeding_algorithm::operator()(
     const spacepoint_collection_types::const_view& spacepoints_view) const {
diff --git a/device/cuda/src/seeding/spacepoint_binning.cu b/device/cuda/src/seeding/spacepoint_binning.cu
index 441ef474f0..62d3b29e8c 100644
--- a/device/cuda/src/seeding/spacepoint_binning.cu
+++ b/device/cuda/src/seeding/spacepoint_binning.cu
@@ -1,11 +1,12 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
 
 // Local include(s).
+#include "../utils/utils.hpp"
 #include "traccc/cuda/seeding/spacepoint_binning.hpp"
 #include "traccc/cuda/utils/definitions.hpp"
 
@@ -45,33 +46,30 @@ __global__ void populate_grid(
 
 spacepoint_binning::spacepoint_binning(
     const seedfinder_config& config, const spacepoint_grid_config& grid_config,
-    const traccc::memory_resource& mr)
+    const traccc::memory_resource& mr, vecmem::copy& copy, stream& str)
     : m_config(config.toInternalUnits()),
       m_axes(get_axes(grid_config.toInternalUnits(),
                       (mr.host ? *(mr.host) : mr.main))),
-      m_mr(mr) {
-
-    // Initialize m_copy ptr based on memory resources that were given
-    if (mr.host) {
-        m_copy = std::make_unique<vecmem::cuda::copy>();
-    } else {
-        m_copy = std::make_unique<vecmem::copy>();
-    }
-}
+      m_mr(mr),
+      m_copy(copy),
+      m_stream(str) {}
 
 sp_grid_buffer spacepoint_binning::operator()(
     const spacepoint_collection_types::const_view& spacepoints_view) const {
 
+    // Get a convenience variable for the stream that we'll be using.
+    cudaStream_t stream = details::get_stream(m_stream);
+
     // Get the spacepoint sizes from the view
-    auto sp_size = m_copy->get_size(spacepoints_view);
+    auto sp_size = m_copy.get_size(spacepoints_view);
 
     // Set up the container that will be filled with the required capacities for
     // the spacepoint grid.
     const std::size_t grid_bins = m_axes.first.n_bins * m_axes.second.n_bins;
     vecmem::data::vector_buffer<unsigned int> grid_capacities_buff(grid_bins,
                                                                    m_mr.main);
-    m_copy->setup(grid_capacities_buff);
-    m_copy->memset(grid_capacities_buff, 0);
+    m_copy.setup(grid_capacities_buff);
+    m_copy.memset(grid_capacities_buff, 0);
     vecmem::data::vector_view<unsigned int> grid_capacities_view =
         grid_capacities_buff;
 
@@ -80,31 +78,31 @@ sp_grid_buffer spacepoint_binning::operator()(
     const unsigned int num_blocks = (sp_size + num_threads - 1) / num_threads;
 
     // Fill the grid capacity container.
-    kernels::count_grid_capacities<<<num_blocks, num_threads>>>(
+    kernels::count_grid_capacities<<<num_blocks, num_threads, 0, stream>>>(
         m_config, m_axes.first, m_axes.second, spacepoints_view,
         grid_capacities_view);
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
 
     // Copy grid capacities back to the host
     vecmem::vector<unsigned int> grid_capacities_host(m_mr.host ? m_mr.host
                                                                 : &(m_mr.main));
-    (*m_copy)(grid_capacities_buff, grid_capacities_host);
+    m_copy(grid_capacities_buff, grid_capacities_host);
 
+    m_stream.synchronize();
     // Create the grid buffer.
     sp_grid_buffer grid_buffer(
         m_axes.first, m_axes.second, std::vector<std::size_t>(grid_bins, 0),
         std::vector<std::size_t>(grid_capacities_host.begin(),
                                  grid_capacities_host.end()),
         m_mr.main, m_mr.host);
-    m_copy->setup(grid_buffer._buffer);
+    m_copy.setup(grid_buffer._buffer);
     sp_grid_view grid_view = grid_buffer;
 
     // Populate the grid.
-    kernels::populate_grid<<<num_blocks, num_threads>>>(
+    kernels::populate_grid<<<num_blocks, num_threads, 0, stream>>>(
         m_config, spacepoints_view, grid_view);
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     // Return the freshly filled buffer.
     return grid_buffer;
diff --git a/device/cuda/src/seeding/track_params_estimation.cu b/device/cuda/src/seeding/track_params_estimation.cu
index 5990c809fc..c57dc2993e 100644
--- a/device/cuda/src/seeding/track_params_estimation.cu
+++ b/device/cuda/src/seeding/track_params_estimation.cu
@@ -1,13 +1,16 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
 
-// Project include(s).
+// Local include(s).
+#include "../utils/utils.hpp"
 #include "traccc/cuda/seeding/track_params_estimation.hpp"
 #include "traccc/cuda/utils/definitions.hpp"
+
+// Project include(s).
 #include "traccc/seeding/device/estimate_track_params.hpp"
 
 // VecMem include(s).
@@ -29,28 +32,23 @@ __global__ void estimate_track_params(
 }  // namespace kernels
 
 track_params_estimation::track_params_estimation(
-    const traccc::memory_resource& mr)
-    : m_mr(mr) {
-
-    // Initialize m_copy ptr based on memory resources that were given
-    if (mr.host) {
-        m_copy = std::make_unique<vecmem::cuda::copy>();
-    } else {
-        m_copy = std::make_unique<vecmem::copy>();
-    }
-}
+    const traccc::memory_resource& mr, vecmem::copy& copy, stream& str)
+    : m_mr(mr), m_copy(copy), m_stream(str) {}
 
 track_params_estimation::output_type track_params_estimation::operator()(
     const spacepoint_collection_types::const_view& spacepoints_view,
     const alt_seed_collection_types::const_view& seeds_view) const {
 
+    // Get a convenience variable for the stream that we'll be using.
+    cudaStream_t stream = details::get_stream(m_stream);
+
     // Get the size of the seeds view
-    const std::size_t seeds_size = m_copy->get_size(seeds_view);
+    const std::size_t seeds_size = m_copy.get_size(seeds_view);
 
     // Create device buffer for the parameters
     bound_track_parameters_collection_types::buffer params_buffer(seeds_size,
                                                                   m_mr.main);
-    m_copy->setup(params_buffer);
+    m_copy.setup(params_buffer);
 
     // Check if anything needs to be done.
     if (seeds_size == 0) {
@@ -67,12 +65,12 @@ track_params_estimation::output_type track_params_estimation::operator()(
     unsigned int num_blocks = (seeds_size + num_threads - 1) / num_threads;
 
     // run the kernel
-    kernels::estimate_track_params<<<num_blocks, num_threads>>>(
+    kernels::estimate_track_params<<<num_blocks, num_threads, 0, stream>>>(
         spacepoints_view, seeds_view, params_buffer);
 
     // cuda error check
     CUDA_ERROR_CHECK(cudaGetLastError());
-    CUDA_ERROR_CHECK(cudaDeviceSynchronize());
+    m_stream.synchronize();
 
     return params_buffer;
 }
diff --git a/examples/run/cuda/full_chain_algorithm.cpp b/examples/run/cuda/full_chain_algorithm.cpp
index 3964132a10..897e759fee 100644
--- a/examples/run/cuda/full_chain_algorithm.cpp
+++ b/examples/run/cuda/full_chain_algorithm.cpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2022 CERN for the benefit of the ACTS project
+ * (c) 2022-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -39,9 +39,10 @@ full_chain_algorithm::full_chain_algorithm(
       m_target_cells_per_partition(target_cells_per_partition),
       m_clusterization(memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy,
                        m_stream, m_target_cells_per_partition),
-      m_seeding(memory_resource{*m_cached_device_mr, &m_host_mr}),
+      m_seeding(memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy,
+                m_stream),
       m_track_parameter_estimation(
-          memory_resource{*m_cached_device_mr, &m_host_mr}) {
+          memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, m_stream) {
 
     // Tell the user what device is being used.
     int device = 0;
@@ -63,9 +64,10 @@ full_chain_algorithm::full_chain_algorithm(const full_chain_algorithm& parent)
       m_target_cells_per_partition(parent.m_target_cells_per_partition),
       m_clusterization(memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy,
                        m_stream, m_target_cells_per_partition),
-      m_seeding(memory_resource{*m_cached_device_mr, &m_host_mr}),
+      m_seeding(memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy,
+                m_stream),
       m_track_parameter_estimation(
-          memory_resource{*m_cached_device_mr, &m_host_mr}) {}
+          memory_resource{*m_cached_device_mr, &m_host_mr}, m_copy, m_stream) {}
 
 full_chain_algorithm::~full_chain_algorithm() {
 
diff --git a/examples/run/cuda/seeding_example_cuda.cpp b/examples/run/cuda/seeding_example_cuda.cpp
index 04d8a65cb0..95cd2c14b6 100644
--- a/examples/run/cuda/seeding_example_cuda.cpp
+++ b/examples/run/cuda/seeding_example_cuda.cpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -23,6 +23,7 @@
 // VecMem include(s).
 #include <vecmem/memory/cuda/device_memory_resource.hpp>
 #include <vecmem/memory/host_memory_resource.hpp>
+#include <vecmem/utils/cuda/async_copy.hpp>
 #include <vecmem/utils/cuda/copy.hpp>
 
 // System include(s).
@@ -52,10 +53,13 @@ int seq_run(const traccc::seeding_input_config& i_cfg,
     traccc::seeding_algorithm sa(host_mr);
     traccc::track_params_estimation tp(host_mr);
 
+    traccc::cuda::stream stream;
+
     vecmem::cuda::copy copy;
+    vecmem::cuda::async_copy async_copy{stream.cudaStream()};
 
-    traccc::cuda::seeding_algorithm sa_cuda{mr};
-    traccc::cuda::track_params_estimation tp_cuda{mr};
+    traccc::cuda::seeding_algorithm sa_cuda{mr, async_copy, stream};
+    traccc::cuda::track_params_estimation tp_cuda{mr, async_copy, stream};
 
     // performance writer
     traccc::seeding_performance_writer sd_performance_writer(
diff --git a/examples/run/cuda/seq_example_cuda.cpp b/examples/run/cuda/seq_example_cuda.cpp
index 452dbd000f..55ee204114 100644
--- a/examples/run/cuda/seq_example_cuda.cpp
+++ b/examples/run/cuda/seq_example_cuda.cpp
@@ -1,6 +1,6 @@
 /** TRACCC library, part of the ACTS project (R&D line)
  *
- * (c) 2021-2022 CERN for the benefit of the ACTS project
+ * (c) 2021-2023 CERN for the benefit of the ACTS project
  *
  * Mozilla Public License Version 2.0
  */
@@ -79,8 +79,8 @@ int seq_run(const traccc::full_tracking_input_config& i_cfg,
 
     traccc::cuda::clusterization_algorithm ca_cuda(
         mr, async_copy, stream, common_opts.target_cells_per_partition);
-    traccc::cuda::seeding_algorithm sa_cuda(mr);
-    traccc::cuda::track_params_estimation tp_cuda(mr);
+    traccc::cuda::seeding_algorithm sa_cuda(mr, async_copy, stream);
+    traccc::cuda::track_params_estimation tp_cuda(mr, async_copy, stream);
 
     // performance writer
     traccc::seeding_performance_writer sd_performance_writer(