diff --git a/core/include/traccc/clusterization/clustering_config.hpp b/core/include/traccc/clusterization/clustering_config.hpp index d071f3ccf0..6f63ab1ec0 100644 --- a/core/include/traccc/clusterization/clustering_config.hpp +++ b/core/include/traccc/clusterization/clustering_config.hpp @@ -54,6 +54,13 @@ struct clustering_config { */ unsigned int backup_size_multiplier; + /** + * @brief Flag to enforce debug output. + * + * @warning This will slown down the clustering algorithm. + */ + bool enable_debug_output; + /** * @brief The maximum number of cells per partition. */ diff --git a/device/alpaka/src/clusterization/clusterization_algorithm.cpp b/device/alpaka/src/clusterization/clusterization_algorithm.cpp index 6e5f116bc9..c0478a68b3 100644 --- a/device/alpaka/src/clusterization/clusterization_algorithm.cpp +++ b/device/alpaka/src/clusterization/clusterization_algorithm.cpp @@ -60,7 +60,7 @@ struct CCLKernel { partition_start, partition_end, outi, f_view, gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, - barry_r, measurements_view, cell_links); + barry_r, measurements_view, cell_links, nullptr); } }; diff --git a/device/common/include/traccc/clusterization/device/ccl_debug_output.hpp b/device/common/include/traccc/clusterization/device/ccl_debug_output.hpp new file mode 100644 index 0000000000..71dbfd57ca --- /dev/null +++ b/device/common/include/traccc/clusterization/device/ccl_debug_output.hpp @@ -0,0 +1,25 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include + +namespace traccc::device::details { +struct ccl_debug_output { + uint32_t num_oversized_partitions; + + static ccl_debug_output init() { + ccl_debug_output rv; + + rv.num_oversized_partitions = 0; + + return rv; + } +}; +} // namespace traccc::device::details diff --git a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp index 58d709ee42..28cfff275b 100644 --- a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp +++ b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp @@ -9,6 +9,7 @@ // Project include(s). #include "traccc/clusterization/clustering_config.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/definitions/hints.hpp" #include "traccc/definitions/qualifiers.hpp" @@ -53,6 +54,7 @@ namespace traccc::device { /// @param[out] measurements_view collection of measurements /// @param[out] cell_links collection of links to measurements each cell is /// put into +/// @param[out] debug_output debug output location template TRACCC_DEVICE inline void ccl_kernel( @@ -68,7 +70,8 @@ TRACCC_DEVICE inline void ccl_kernel( vecmem::data::vector_view adjv_backup_view, vecmem::device_atomic_ref backup_mutex, barrier_t& barrier, measurement_collection_types::view measurements_view, - vecmem::data::vector_view cell_links); + vecmem::data::vector_view cell_links, + details::ccl_debug_output* debug_output); } // namespace traccc::device diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp index 773261b344..bb1af0477e 100644 --- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp +++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp @@ -11,6 +11,7 @@ #include "traccc/clusterization/clustering_config.hpp" #include "traccc/clusterization/device/aggregate_cluster.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/clusterization/device/reduce_problem_cell.hpp" #include "traccc/device/concepts/barrier.hpp" @@ -220,7 +221,8 @@ TRACCC_DEVICE inline void ccl_kernel( vecmem::data::vector_view adjv_backup_view, vecmem::device_atomic_ref backup_mutex, barrier_t& barrier, measurement_collection_types::view measurements_view, - vecmem::data::vector_view cell_links) { + vecmem::data::vector_view cell_links, + details::ccl_debug_output* debug_output) { // Construct device containers around the views. const cell_collection_types::const_device cells_device(cells_view); const cell_module_collection_types::const_device modules_device( @@ -325,6 +327,13 @@ TRACCC_DEVICE inline void ccl_kernel( if (size > cfg.max_partition_size()) { if (thread_id.getLocalThreadIdX() == 0) { lock.lock(); + + if (debug_output) { + vecmem::device_atomic_ref + num_oversized_partitions_atm( + debug_output->num_oversized_partitions); + num_oversized_partitions_atm.fetch_add(1); + } } barrier.blockBarrier(); diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu index 75304bd0a3..507615c482 100644 --- a/device/cuda/src/clusterization/clusterization_algorithm.cu +++ b/device/cuda/src/clusterization/clusterization_algorithm.cu @@ -6,12 +6,16 @@ */ // CUDA Library include(s). +#include +#include + #include "../sanity/contiguous_on.cuh" #include "../sanity/ordered_on.cuh" #include "../utils/barrier.hpp" #include "../utils/cuda_error_handling.hpp" #include "../utils/utils.hpp" #include "traccc/clusterization/clustering_config.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/cuda/clusterization/clusterization_algorithm.hpp" #include "traccc/cuda/utils/thread_id.hpp" @@ -21,6 +25,9 @@ // Project include(s) #include "traccc/clusterization/device/ccl_kernel.hpp" +// System include +#include + // Vecmem include(s). #include #include @@ -40,7 +47,8 @@ __global__ void ccl_kernel( vecmem::data::vector_view gf_backup_view, vecmem::data::vector_view adjc_backup_view, vecmem::data::vector_view adjv_backup_view, - unsigned int* backup_mutex_ptr) { + unsigned int* backup_mutex_ptr, + device::details::ccl_debug_output* debug_output) { __shared__ std::size_t partition_start, partition_end; __shared__ std::size_t outi; @@ -62,7 +70,7 @@ __global__ void ccl_kernel( partition_start, partition_end, outi, f_view, gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, barry_r, - measurements_view, cell_links); + measurements_view, cell_links, debug_output); } } // namespace kernels @@ -132,14 +140,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( assert(m_config.max_cells_per_thread <= device::details::CELLS_PER_THREAD_STACK_LIMIT); + // If necessary, allocate an object for storing the debug information + vecmem::unique_alloc_ptr debug_output; + + if (m_config.enable_debug_output) { + debug_output = + vecmem::make_unique_alloc( + m_mr.main); + + device::details::ccl_debug_output empty_output = + device::details::ccl_debug_output::init(); + + TRACCC_CUDA_ERROR_CHECK( + cudaMemcpyAsync(debug_output.get(), &empty_output, + sizeof(device::details::ccl_debug_output), + cudaMemcpyHostToDevice, stream)); + } + kernels::ccl_kernel<<>>( - m_config, cells, modules, measurements, cell_links, m_f_backup, - m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get()); + stream>>>(m_config, cells, modules, measurements, + cell_links, m_f_backup, m_gf_backup, + m_adjc_backup, m_adjv_backup, + m_backup_mutex.get(), debug_output.get()); TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); + if (debug_output) { + device::details::ccl_debug_output host_output; + + TRACCC_CUDA_ERROR_CHECK( + cudaMemcpyAsync(&host_output, debug_output.get(), + sizeof(device::details::ccl_debug_output), + cudaMemcpyDeviceToHost, stream)); + + TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream)); + + if (host_output.num_oversized_partitions > 0) { + std::cout << "WARNING: @clusterization_algorithm: " + << "Clustering encountered " + << host_output.num_oversized_partitions + << " oversized partitions; if this number is too large, " + "it may cause performance problems." + << std::endl; + } + } + // Return the reconstructed measurements. return measurements; } diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl index b1ef6f01d8..af9c9e2469 100644 --- a/device/sycl/src/clusterization/clusterization_algorithm.sycl +++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl @@ -10,6 +10,7 @@ #include "../sanity/ordered_on.hpp" #include "../utils/barrier.hpp" #include "../utils/get_queue.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/sycl/clusterization/clusterization_algorithm.hpp" #include "traccc/sycl/utils/thread_id.hpp" @@ -113,9 +114,28 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( assert(m_config.max_cells_per_thread <= device::details::CELLS_PER_THREAD_STACK_LIMIT); + // If necessary, allocate an object for storing the debug information + vecmem::unique_alloc_ptr debug_output; + cl::sycl::event evt_copy_debug_output_h2d; + + if (m_config.enable_debug_output) { + debug_output = + vecmem::make_unique_alloc( + m_mr.main); + + device::details::ccl_debug_output empty_output = + device::details::ccl_debug_output::init(); + + evt_copy_debug_output_h2d = details::get_queue(m_queue).memcpy( + debug_output.get(), &empty_output, + sizeof(device::details::ccl_debug_output)); + } + // Run ccl kernel - details::get_queue(m_queue) - .submit([&](::sycl::handler& h) { + cl::sycl::event evt_run_kernel = + details::get_queue(m_queue).submit([&](::sycl::handler& h) { + h.depends_on(evt_copy_debug_output_h2d); + // Allocate shared memory for the kernel. vecmem::sycl::local_accessor shared_uint(3, h); vecmem::sycl::local_accessor shared_idx( @@ -130,8 +150,8 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( gf_backup_view = vecmem::get_data(m_gf_backup), adjc_backup_view = vecmem::get_data(m_adjc_backup), adjv_backup_view = vecmem::get_data(m_adjv_backup), - mutex_ptr = m_backup_mutex.get(), - cfg = m_config](::sycl::nd_item<1> item) { + mutex_ptr = m_backup_mutex.get(), cfg = m_config, + debug_output = debug_output.get()](::sycl::nd_item<1> item) { // Construct more readable variable names. vecmem::data::vector_view f_view{ static_cast(cfg.max_partition_size()), @@ -152,15 +172,38 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( const sycl::thread_id1 thread_id(item); // Run the algorithm for this thread. - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, - f_view, gf_view, f_backup_view, - gf_backup_view, adjc_backup_view, - adjv_backup_view, backup_mutex, barry_r, - measurements_view, cell_links_view); + device::ccl_kernel( + cfg, thread_id, cells_view, modules_view, + partition_start, partition_end, outi, f_view, gf_view, + f_backup_view, gf_backup_view, adjc_backup_view, + adjv_backup_view, backup_mutex, barry_r, + measurements_view, cell_links_view, debug_output); }); - }) - .wait_and_throw(); + }); + + cl::sycl::event evt_copy_debug_output_d2h; + + if (debug_output) { + device::details::ccl_debug_output host_output; + + evt_copy_debug_output_d2h = details::get_queue(m_queue).memcpy( + &host_output, debug_output.get(), + sizeof(device::details::ccl_debug_output), {evt_run_kernel}); + + evt_copy_debug_output_d2h.wait_and_throw(); + + if (host_output.num_oversized_partitions > 0) { + std::cout << "WARNING: @clusterization_algorithm: " + << "Clustering encountered " + << host_output.num_oversized_partitions + << " oversized partitions; if this number is too large, " + "it may cause performance problems." + << std::endl; + } + } + + cl::sycl::event::wait_and_throw( + {evt_run_kernel, evt_copy_debug_output_d2h}); // Return the reconstructed measurements. return measurements; diff --git a/examples/options/include/traccc/options/clusterization.hpp b/examples/options/include/traccc/options/clusterization.hpp index 24471be66f..56562b2c68 100644 --- a/examples/options/include/traccc/options/clusterization.hpp +++ b/examples/options/include/traccc/options/clusterization.hpp @@ -37,6 +37,7 @@ class clusterization unsigned int max_cells_per_thread; unsigned int target_cells_per_thread; unsigned int backup_size_multiplier; + bool enable_debug_output; /// @} /// Print the specific options of this class diff --git a/examples/options/src/clusterization.cpp b/examples/options/src/clusterization.cpp index 54ea642b21..328153594e 100644 --- a/examples/options/src/clusterization.cpp +++ b/examples/options/src/clusterization.cpp @@ -13,6 +13,14 @@ // System include(s). #include +namespace { +#ifndef NDEBUG +constexpr bool enable_cca_debug_default = true; +#else +constexpr bool enable_cca_debug_default = false; +#endif +} // namespace + namespace traccc::opts { clusterization::clusterization() : interface("Clusterization Options") { @@ -33,6 +41,10 @@ clusterization::clusterization() : interface("Clusterization Options") { boost::program_options::value(&backup_size_multiplier) ->default_value(256), "The size multiplier of the backup scratch space"); + m_desc.add_options()("cca-debug", + boost::program_options::value(&enable_debug_output) + ->default_value(enable_cca_debug_default), + "The size multiplier of the backup scratch space"); } clusterization::operator clustering_config() const { @@ -42,6 +54,7 @@ clusterization::operator clustering_config() const { rv.max_cells_per_thread = max_cells_per_thread; rv.target_cells_per_thread = target_cells_per_thread; rv.backup_size_multiplier = backup_size_multiplier; + rv.enable_debug_output = enable_debug_output; return rv; } @@ -54,7 +67,8 @@ std::ostream& clusterization::print_impl(std::ostream& out) const { out << " Threads per partition: " << threads_per_partition << "\n"; out << " Target cells per thread: " << target_cells_per_thread << "\n"; out << " Max cells per thread: " << max_cells_per_thread << "\n"; - out << " Scratch space size mult.: " << backup_size_multiplier; + out << " Scratch space size mult.: " << backup_size_multiplier << "\n"; + out << " Debug output printing: " << enable_debug_output << "\n"; return out; } diff --git a/tests/common/tests/cca_test.hpp b/tests/common/tests/cca_test.hpp index 0499cd05c7..3c97737ab8 100644 --- a/tests/common/tests/cca_test.hpp +++ b/tests/common/tests/cca_test.hpp @@ -43,6 +43,7 @@ inline traccc::clustering_config default_ccl_test_config() { rv.max_cells_per_thread = 16; rv.target_cells_per_thread = 8; rv.backup_size_multiplier = 256; + rv.enable_debug_output = false; return rv; } @@ -54,6 +55,7 @@ inline traccc::clustering_config tiny_ccl_test_config() { rv.max_cells_per_thread = 1; rv.target_cells_per_thread = 1; rv.backup_size_multiplier = 16384; + rv.enable_debug_output = false; return rv; }