From 23b32665b27cc10f0fff311b4a0c1b9fe47c795d Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Wed, 10 Jul 2024 20:12:50 +0200 Subject: [PATCH] Add debug output to clustering algorithms In #595, I equipped the CCA code with some edge case handling which allows it to handle oversized partitions. Although this makes sure the algorithm works, it also risks to slow down execution. In order to better understand how much performance we might be losing, this commit adds the ability for the SYCL and CUDA algorithms to print some warnings if they ever encounter this edge case. --- .../clusterization/clustering_config.hpp | 7 ++ .../clusterization_algorithm.cpp | 2 +- .../device/ccl_debug_output.hpp | 25 +++++++ .../clusterization/device/ccl_kernel.hpp | 5 +- .../clusterization/device/impl/ccl_kernel.ipp | 11 ++- .../clusterization_algorithm.cu | 56 ++++++++++++++-- .../clusterization_algorithm.sycl | 67 +++++++++++++++---- .../include/traccc/options/clusterization.hpp | 1 + examples/options/src/clusterization.cpp | 16 ++++- tests/common/tests/cca_test.hpp | 2 + 10 files changed, 171 insertions(+), 21 deletions(-) create mode 100644 device/common/include/traccc/clusterization/device/ccl_debug_output.hpp diff --git a/core/include/traccc/clusterization/clustering_config.hpp b/core/include/traccc/clusterization/clustering_config.hpp index d071f3ccf0..6f63ab1ec0 100644 --- a/core/include/traccc/clusterization/clustering_config.hpp +++ b/core/include/traccc/clusterization/clustering_config.hpp @@ -54,6 +54,13 @@ struct clustering_config { */ unsigned int backup_size_multiplier; + /** + * @brief Flag to enforce debug output. + * + * @warning This will slown down the clustering algorithm. + */ + bool enable_debug_output; + /** * @brief The maximum number of cells per partition. */ diff --git a/device/alpaka/src/clusterization/clusterization_algorithm.cpp b/device/alpaka/src/clusterization/clusterization_algorithm.cpp index 6e5f116bc9..c0478a68b3 100644 --- a/device/alpaka/src/clusterization/clusterization_algorithm.cpp +++ b/device/alpaka/src/clusterization/clusterization_algorithm.cpp @@ -60,7 +60,7 @@ struct CCLKernel { partition_start, partition_end, outi, f_view, gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, - barry_r, measurements_view, cell_links); + barry_r, measurements_view, cell_links, nullptr); } }; diff --git a/device/common/include/traccc/clusterization/device/ccl_debug_output.hpp b/device/common/include/traccc/clusterization/device/ccl_debug_output.hpp new file mode 100644 index 0000000000..71dbfd57ca --- /dev/null +++ b/device/common/include/traccc/clusterization/device/ccl_debug_output.hpp @@ -0,0 +1,25 @@ +/** + * traccc library, part of the ACTS project (R&D line) + * + * (c) 2024 CERN for the benefit of the ACTS project + * + * Mozilla Public License Version 2.0 + */ + +#pragma once + +#include + +namespace traccc::device::details { +struct ccl_debug_output { + uint32_t num_oversized_partitions; + + static ccl_debug_output init() { + ccl_debug_output rv; + + rv.num_oversized_partitions = 0; + + return rv; + } +}; +} // namespace traccc::device::details diff --git a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp index 58d709ee42..28cfff275b 100644 --- a/device/common/include/traccc/clusterization/device/ccl_kernel.hpp +++ b/device/common/include/traccc/clusterization/device/ccl_kernel.hpp @@ -9,6 +9,7 @@ // Project include(s). #include "traccc/clusterization/clustering_config.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/definitions/hints.hpp" #include "traccc/definitions/qualifiers.hpp" @@ -53,6 +54,7 @@ namespace traccc::device { /// @param[out] measurements_view collection of measurements /// @param[out] cell_links collection of links to measurements each cell is /// put into +/// @param[out] debug_output debug output location template TRACCC_DEVICE inline void ccl_kernel( @@ -68,7 +70,8 @@ TRACCC_DEVICE inline void ccl_kernel( vecmem::data::vector_view adjv_backup_view, vecmem::device_atomic_ref backup_mutex, barrier_t& barrier, measurement_collection_types::view measurements_view, - vecmem::data::vector_view cell_links); + vecmem::data::vector_view cell_links, + details::ccl_debug_output* debug_output); } // namespace traccc::device diff --git a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp index 773261b344..bb1af0477e 100644 --- a/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp +++ b/device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp @@ -11,6 +11,7 @@ #include "traccc/clusterization/clustering_config.hpp" #include "traccc/clusterization/device/aggregate_cluster.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/clusterization/device/reduce_problem_cell.hpp" #include "traccc/device/concepts/barrier.hpp" @@ -220,7 +221,8 @@ TRACCC_DEVICE inline void ccl_kernel( vecmem::data::vector_view adjv_backup_view, vecmem::device_atomic_ref backup_mutex, barrier_t& barrier, measurement_collection_types::view measurements_view, - vecmem::data::vector_view cell_links) { + vecmem::data::vector_view cell_links, + details::ccl_debug_output* debug_output) { // Construct device containers around the views. const cell_collection_types::const_device cells_device(cells_view); const cell_module_collection_types::const_device modules_device( @@ -325,6 +327,13 @@ TRACCC_DEVICE inline void ccl_kernel( if (size > cfg.max_partition_size()) { if (thread_id.getLocalThreadIdX() == 0) { lock.lock(); + + if (debug_output) { + vecmem::device_atomic_ref + num_oversized_partitions_atm( + debug_output->num_oversized_partitions); + num_oversized_partitions_atm.fetch_add(1); + } } barrier.blockBarrier(); diff --git a/device/cuda/src/clusterization/clusterization_algorithm.cu b/device/cuda/src/clusterization/clusterization_algorithm.cu index 75304bd0a3..507615c482 100644 --- a/device/cuda/src/clusterization/clusterization_algorithm.cu +++ b/device/cuda/src/clusterization/clusterization_algorithm.cu @@ -6,12 +6,16 @@ */ // CUDA Library include(s). +#include +#include + #include "../sanity/contiguous_on.cuh" #include "../sanity/ordered_on.cuh" #include "../utils/barrier.hpp" #include "../utils/cuda_error_handling.hpp" #include "../utils/utils.hpp" #include "traccc/clusterization/clustering_config.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/cuda/clusterization/clusterization_algorithm.hpp" #include "traccc/cuda/utils/thread_id.hpp" @@ -21,6 +25,9 @@ // Project include(s) #include "traccc/clusterization/device/ccl_kernel.hpp" +// System include +#include + // Vecmem include(s). #include #include @@ -40,7 +47,8 @@ __global__ void ccl_kernel( vecmem::data::vector_view gf_backup_view, vecmem::data::vector_view adjc_backup_view, vecmem::data::vector_view adjv_backup_view, - unsigned int* backup_mutex_ptr) { + unsigned int* backup_mutex_ptr, + device::details::ccl_debug_output* debug_output) { __shared__ std::size_t partition_start, partition_end; __shared__ std::size_t outi; @@ -62,7 +70,7 @@ __global__ void ccl_kernel( partition_start, partition_end, outi, f_view, gf_view, f_backup_view, gf_backup_view, adjc_backup_view, adjv_backup_view, backup_mutex, barry_r, - measurements_view, cell_links); + measurements_view, cell_links, debug_output); } } // namespace kernels @@ -132,14 +140,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( assert(m_config.max_cells_per_thread <= device::details::CELLS_PER_THREAD_STACK_LIMIT); + // If necessary, allocate an object for storing the debug information + vecmem::unique_alloc_ptr debug_output; + + if (m_config.enable_debug_output) { + debug_output = + vecmem::make_unique_alloc( + m_mr.main); + + device::details::ccl_debug_output empty_output = + device::details::ccl_debug_output::init(); + + TRACCC_CUDA_ERROR_CHECK( + cudaMemcpyAsync(debug_output.get(), &empty_output, + sizeof(device::details::ccl_debug_output), + cudaMemcpyHostToDevice, stream)); + } + kernels::ccl_kernel<<>>( - m_config, cells, modules, measurements, cell_links, m_f_backup, - m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get()); + stream>>>(m_config, cells, modules, measurements, + cell_links, m_f_backup, m_gf_backup, + m_adjc_backup, m_adjv_backup, + m_backup_mutex.get(), debug_output.get()); TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); + if (debug_output) { + device::details::ccl_debug_output host_output; + + TRACCC_CUDA_ERROR_CHECK( + cudaMemcpyAsync(&host_output, debug_output.get(), + sizeof(device::details::ccl_debug_output), + cudaMemcpyDeviceToHost, stream)); + + TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream)); + + if (host_output.num_oversized_partitions > 0) { + std::cout << "WARNING: @clusterization_algorithm: " + << "Clustering encountered " + << host_output.num_oversized_partitions + << " oversized partitions; if this number is too large, " + "it may cause performance problems." + << std::endl; + } + } + // Return the reconstructed measurements. return measurements; } diff --git a/device/sycl/src/clusterization/clusterization_algorithm.sycl b/device/sycl/src/clusterization/clusterization_algorithm.sycl index b1ef6f01d8..af9c9e2469 100644 --- a/device/sycl/src/clusterization/clusterization_algorithm.sycl +++ b/device/sycl/src/clusterization/clusterization_algorithm.sycl @@ -10,6 +10,7 @@ #include "../sanity/ordered_on.hpp" #include "../utils/barrier.hpp" #include "../utils/get_queue.hpp" +#include "traccc/clusterization/device/ccl_debug_output.hpp" #include "traccc/clusterization/device/ccl_kernel_definitions.hpp" #include "traccc/sycl/clusterization/clusterization_algorithm.hpp" #include "traccc/sycl/utils/thread_id.hpp" @@ -113,9 +114,28 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( assert(m_config.max_cells_per_thread <= device::details::CELLS_PER_THREAD_STACK_LIMIT); + // If necessary, allocate an object for storing the debug information + vecmem::unique_alloc_ptr debug_output; + cl::sycl::event evt_copy_debug_output_h2d; + + if (m_config.enable_debug_output) { + debug_output = + vecmem::make_unique_alloc( + m_mr.main); + + device::details::ccl_debug_output empty_output = + device::details::ccl_debug_output::init(); + + evt_copy_debug_output_h2d = details::get_queue(m_queue).memcpy( + debug_output.get(), &empty_output, + sizeof(device::details::ccl_debug_output)); + } + // Run ccl kernel - details::get_queue(m_queue) - .submit([&](::sycl::handler& h) { + cl::sycl::event evt_run_kernel = + details::get_queue(m_queue).submit([&](::sycl::handler& h) { + h.depends_on(evt_copy_debug_output_h2d); + // Allocate shared memory for the kernel. vecmem::sycl::local_accessor shared_uint(3, h); vecmem::sycl::local_accessor shared_idx( @@ -130,8 +150,8 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( gf_backup_view = vecmem::get_data(m_gf_backup), adjc_backup_view = vecmem::get_data(m_adjc_backup), adjv_backup_view = vecmem::get_data(m_adjv_backup), - mutex_ptr = m_backup_mutex.get(), - cfg = m_config](::sycl::nd_item<1> item) { + mutex_ptr = m_backup_mutex.get(), cfg = m_config, + debug_output = debug_output.get()](::sycl::nd_item<1> item) { // Construct more readable variable names. vecmem::data::vector_view f_view{ static_cast(cfg.max_partition_size()), @@ -152,15 +172,38 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()( const sycl::thread_id1 thread_id(item); // Run the algorithm for this thread. - device::ccl_kernel(cfg, thread_id, cells_view, modules_view, - partition_start, partition_end, outi, - f_view, gf_view, f_backup_view, - gf_backup_view, adjc_backup_view, - adjv_backup_view, backup_mutex, barry_r, - measurements_view, cell_links_view); + device::ccl_kernel( + cfg, thread_id, cells_view, modules_view, + partition_start, partition_end, outi, f_view, gf_view, + f_backup_view, gf_backup_view, adjc_backup_view, + adjv_backup_view, backup_mutex, barry_r, + measurements_view, cell_links_view, debug_output); }); - }) - .wait_and_throw(); + }); + + cl::sycl::event evt_copy_debug_output_d2h; + + if (debug_output) { + device::details::ccl_debug_output host_output; + + evt_copy_debug_output_d2h = details::get_queue(m_queue).memcpy( + &host_output, debug_output.get(), + sizeof(device::details::ccl_debug_output), {evt_run_kernel}); + + evt_copy_debug_output_d2h.wait_and_throw(); + + if (host_output.num_oversized_partitions > 0) { + std::cout << "WARNING: @clusterization_algorithm: " + << "Clustering encountered " + << host_output.num_oversized_partitions + << " oversized partitions; if this number is too large, " + "it may cause performance problems." + << std::endl; + } + } + + cl::sycl::event::wait_and_throw( + {evt_run_kernel, evt_copy_debug_output_d2h}); // Return the reconstructed measurements. return measurements; diff --git a/examples/options/include/traccc/options/clusterization.hpp b/examples/options/include/traccc/options/clusterization.hpp index 24471be66f..56562b2c68 100644 --- a/examples/options/include/traccc/options/clusterization.hpp +++ b/examples/options/include/traccc/options/clusterization.hpp @@ -37,6 +37,7 @@ class clusterization unsigned int max_cells_per_thread; unsigned int target_cells_per_thread; unsigned int backup_size_multiplier; + bool enable_debug_output; /// @} /// Print the specific options of this class diff --git a/examples/options/src/clusterization.cpp b/examples/options/src/clusterization.cpp index 54ea642b21..328153594e 100644 --- a/examples/options/src/clusterization.cpp +++ b/examples/options/src/clusterization.cpp @@ -13,6 +13,14 @@ // System include(s). #include +namespace { +#ifndef NDEBUG +constexpr bool enable_cca_debug_default = true; +#else +constexpr bool enable_cca_debug_default = false; +#endif +} // namespace + namespace traccc::opts { clusterization::clusterization() : interface("Clusterization Options") { @@ -33,6 +41,10 @@ clusterization::clusterization() : interface("Clusterization Options") { boost::program_options::value(&backup_size_multiplier) ->default_value(256), "The size multiplier of the backup scratch space"); + m_desc.add_options()("cca-debug", + boost::program_options::value(&enable_debug_output) + ->default_value(enable_cca_debug_default), + "The size multiplier of the backup scratch space"); } clusterization::operator clustering_config() const { @@ -42,6 +54,7 @@ clusterization::operator clustering_config() const { rv.max_cells_per_thread = max_cells_per_thread; rv.target_cells_per_thread = target_cells_per_thread; rv.backup_size_multiplier = backup_size_multiplier; + rv.enable_debug_output = enable_debug_output; return rv; } @@ -54,7 +67,8 @@ std::ostream& clusterization::print_impl(std::ostream& out) const { out << " Threads per partition: " << threads_per_partition << "\n"; out << " Target cells per thread: " << target_cells_per_thread << "\n"; out << " Max cells per thread: " << max_cells_per_thread << "\n"; - out << " Scratch space size mult.: " << backup_size_multiplier; + out << " Scratch space size mult.: " << backup_size_multiplier << "\n"; + out << " Debug output printing: " << enable_debug_output << "\n"; return out; } diff --git a/tests/common/tests/cca_test.hpp b/tests/common/tests/cca_test.hpp index 0499cd05c7..3c97737ab8 100644 --- a/tests/common/tests/cca_test.hpp +++ b/tests/common/tests/cca_test.hpp @@ -43,6 +43,7 @@ inline traccc::clustering_config default_ccl_test_config() { rv.max_cells_per_thread = 16; rv.target_cells_per_thread = 8; rv.backup_size_multiplier = 256; + rv.enable_debug_output = false; return rv; } @@ -54,6 +55,7 @@ inline traccc::clustering_config tiny_ccl_test_config() { rv.max_cells_per_thread = 1; rv.target_cells_per_thread = 1; rv.backup_size_multiplier = 16384; + rv.enable_debug_output = false; return rv; }