Skip to content

Commit

Permalink
Add debug output to clustering algorithms
Browse files Browse the repository at this point in the history
In #595, I equipped the CCA code with some edge case handling which
allows it to handle oversized partitions. Although this makes sure the
algorithm works, it also risks to slow down execution. In order to
better understand how much performance we might be losing, this commit
adds the ability for the SYCL and CUDA algorithms to print some warnings
if they ever encounter this edge case.
  • Loading branch information
stephenswat committed Aug 1, 2024
1 parent 532d885 commit 23b3266
Show file tree
Hide file tree
Showing 10 changed files with 171 additions and 21 deletions.
7 changes: 7 additions & 0 deletions core/include/traccc/clusterization/clustering_config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ struct clustering_config {
*/
unsigned int backup_size_multiplier;

/**
* @brief Flag to enforce debug output.
*
* @warning This will slown down the clustering algorithm.
*/
bool enable_debug_output;

/**
* @brief The maximum number of cells per partition.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct CCLKernel {
partition_start, partition_end, outi, f_view,
gf_view, f_backup_view, gf_backup_view,
adjc_backup_view, adjv_backup_view, backup_mutex,
barry_r, measurements_view, cell_links);
barry_r, measurements_view, cell_links, nullptr);
}
};

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
* traccc library, part of the ACTS project (R&D line)
*
* (c) 2024 CERN for the benefit of the ACTS project
*
* Mozilla Public License Version 2.0
*/

#pragma once

#include <cstdint>

namespace traccc::device::details {
struct ccl_debug_output {
uint32_t num_oversized_partitions;

static ccl_debug_output init() {
ccl_debug_output rv;

rv.num_oversized_partitions = 0;

return rv;
}
};
} // namespace traccc::device::details
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

// Project include(s).
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/definitions/hints.hpp"
#include "traccc/definitions/qualifiers.hpp"
Expand Down Expand Up @@ -53,6 +54,7 @@ namespace traccc::device {
/// @param[out] measurements_view collection of measurements
/// @param[out] cell_links collection of links to measurements each cell is
/// put into
/// @param[out] debug_output debug output location
template <device::concepts::barrier barrier_t,
device::concepts::thread_id1 thread_id_t>
TRACCC_DEVICE inline void ccl_kernel(
Expand All @@ -68,7 +70,8 @@ TRACCC_DEVICE inline void ccl_kernel(
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links);
vecmem::data::vector_view<unsigned int> cell_links,
details::ccl_debug_output* debug_output);

} // namespace traccc::device

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/aggregate_cluster.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
#include "traccc/device/concepts/barrier.hpp"
Expand Down Expand Up @@ -220,7 +221,8 @@ TRACCC_DEVICE inline void ccl_kernel(
vecmem::data::vector_view<details::index_t> adjv_backup_view,
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
measurement_collection_types::view measurements_view,
vecmem::data::vector_view<unsigned int> cell_links) {
vecmem::data::vector_view<unsigned int> cell_links,
details::ccl_debug_output* debug_output) {
// Construct device containers around the views.
const cell_collection_types::const_device cells_device(cells_view);
const cell_module_collection_types::const_device modules_device(
Expand Down Expand Up @@ -325,6 +327,13 @@ TRACCC_DEVICE inline void ccl_kernel(
if (size > cfg.max_partition_size()) {
if (thread_id.getLocalThreadIdX() == 0) {
lock.lock();

if (debug_output) {
vecmem::device_atomic_ref<uint32_t>
num_oversized_partitions_atm(
debug_output->num_oversized_partitions);
num_oversized_partitions_atm.fetch_add(1);
}
}

barrier.blockBarrier();
Expand Down
56 changes: 51 additions & 5 deletions device/cuda/src/clusterization/clusterization_algorithm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,16 @@
*/

// CUDA Library include(s).
#include <cuda_runtime_api.h>
#include <driver_types.h>

#include "../sanity/contiguous_on.cuh"
#include "../sanity/ordered_on.cuh"
#include "../utils/barrier.hpp"
#include "../utils/cuda_error_handling.hpp"
#include "../utils/utils.hpp"
#include "traccc/clusterization/clustering_config.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/cuda/clusterization/clusterization_algorithm.hpp"
#include "traccc/cuda/utils/thread_id.hpp"
Expand All @@ -21,6 +25,9 @@
// Project include(s)
#include "traccc/clusterization/device/ccl_kernel.hpp"

// System include
#include <iostream>

// Vecmem include(s).
#include <cstring>
#include <vecmem/utils/copy.hpp>
Expand All @@ -40,7 +47,8 @@ __global__ void ccl_kernel(
vecmem::data::vector_view<device::details::index_t> gf_backup_view,
vecmem::data::vector_view<unsigned char> adjc_backup_view,
vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
unsigned int* backup_mutex_ptr) {
unsigned int* backup_mutex_ptr,
device::details::ccl_debug_output* debug_output) {

__shared__ std::size_t partition_start, partition_end;
__shared__ std::size_t outi;
Expand All @@ -62,7 +70,7 @@ __global__ void ccl_kernel(
partition_start, partition_end, outi, f_view, gf_view,
f_backup_view, gf_backup_view, adjc_backup_view,
adjv_backup_view, backup_mutex, barry_r,
measurements_view, cell_links);
measurements_view, cell_links, debug_output);
}

} // namespace kernels
Expand Down Expand Up @@ -132,14 +140,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
assert(m_config.max_cells_per_thread <=
device::details::CELLS_PER_THREAD_STACK_LIMIT);

// If necessary, allocate an object for storing the debug information
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;

if (m_config.enable_debug_output) {
debug_output =
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
m_mr.main);

device::details::ccl_debug_output empty_output =
device::details::ccl_debug_output::init();

TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(debug_output.get(), &empty_output,
sizeof(device::details::ccl_debug_output),
cudaMemcpyHostToDevice, stream));
}

kernels::ccl_kernel<<<num_blocks, m_config.threads_per_partition,
2 * m_config.max_partition_size() *
sizeof(device::details::index_t),
stream>>>(
m_config, cells, modules, measurements, cell_links, m_f_backup,
m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
stream>>>(m_config, cells, modules, measurements,
cell_links, m_f_backup, m_gf_backup,
m_adjc_backup, m_adjv_backup,
m_backup_mutex.get(), debug_output.get());
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());

if (debug_output) {
device::details::ccl_debug_output host_output;

TRACCC_CUDA_ERROR_CHECK(
cudaMemcpyAsync(&host_output, debug_output.get(),
sizeof(device::details::ccl_debug_output),
cudaMemcpyDeviceToHost, stream));

TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));

if (host_output.num_oversized_partitions > 0) {
std::cout << "WARNING: @clusterization_algorithm: "
<< "Clustering encountered "
<< host_output.num_oversized_partitions
<< " oversized partitions; if this number is too large, "
"it may cause performance problems."
<< std::endl;
}
}

// Return the reconstructed measurements.
return measurements;
}
Expand Down
67 changes: 55 additions & 12 deletions device/sycl/src/clusterization/clusterization_algorithm.sycl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "../sanity/ordered_on.hpp"
#include "../utils/barrier.hpp"
#include "../utils/get_queue.hpp"
#include "traccc/clusterization/device/ccl_debug_output.hpp"
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
#include "traccc/sycl/clusterization/clusterization_algorithm.hpp"
#include "traccc/sycl/utils/thread_id.hpp"
Expand Down Expand Up @@ -113,9 +114,28 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
assert(m_config.max_cells_per_thread <=
device::details::CELLS_PER_THREAD_STACK_LIMIT);

// If necessary, allocate an object for storing the debug information
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
cl::sycl::event evt_copy_debug_output_h2d;

if (m_config.enable_debug_output) {
debug_output =
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
m_mr.main);

device::details::ccl_debug_output empty_output =
device::details::ccl_debug_output::init();

evt_copy_debug_output_h2d = details::get_queue(m_queue).memcpy(
debug_output.get(), &empty_output,
sizeof(device::details::ccl_debug_output));
}

// Run ccl kernel
details::get_queue(m_queue)
.submit([&](::sycl::handler& h) {
cl::sycl::event evt_run_kernel =
details::get_queue(m_queue).submit([&](::sycl::handler& h) {
h.depends_on(evt_copy_debug_output_h2d);

// Allocate shared memory for the kernel.
vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
Expand All @@ -130,8 +150,8 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
gf_backup_view = vecmem::get_data(m_gf_backup),
adjc_backup_view = vecmem::get_data(m_adjc_backup),
adjv_backup_view = vecmem::get_data(m_adjv_backup),
mutex_ptr = m_backup_mutex.get(),
cfg = m_config](::sycl::nd_item<1> item) {
mutex_ptr = m_backup_mutex.get(), cfg = m_config,
debug_output = debug_output.get()](::sycl::nd_item<1> item) {
// Construct more readable variable names.
vecmem::data::vector_view<device::details::index_t> f_view{
static_cast<vector_size_t>(cfg.max_partition_size()),
Expand All @@ -152,15 +172,38 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
const sycl::thread_id1 thread_id(item);

// Run the algorithm for this thread.
device::ccl_kernel(cfg, thread_id, cells_view, modules_view,
partition_start, partition_end, outi,
f_view, gf_view, f_backup_view,
gf_backup_view, adjc_backup_view,
adjv_backup_view, backup_mutex, barry_r,
measurements_view, cell_links_view);
device::ccl_kernel(
cfg, thread_id, cells_view, modules_view,
partition_start, partition_end, outi, f_view, gf_view,
f_backup_view, gf_backup_view, adjc_backup_view,
adjv_backup_view, backup_mutex, barry_r,
measurements_view, cell_links_view, debug_output);
});
})
.wait_and_throw();
});

cl::sycl::event evt_copy_debug_output_d2h;

if (debug_output) {
device::details::ccl_debug_output host_output;

evt_copy_debug_output_d2h = details::get_queue(m_queue).memcpy(
&host_output, debug_output.get(),
sizeof(device::details::ccl_debug_output), {evt_run_kernel});

evt_copy_debug_output_d2h.wait_and_throw();

if (host_output.num_oversized_partitions > 0) {
std::cout << "WARNING: @clusterization_algorithm: "
<< "Clustering encountered "
<< host_output.num_oversized_partitions
<< " oversized partitions; if this number is too large, "
"it may cause performance problems."
<< std::endl;
}
}

cl::sycl::event::wait_and_throw(
{evt_run_kernel, evt_copy_debug_output_d2h});

// Return the reconstructed measurements.
return measurements;
Expand Down
1 change: 1 addition & 0 deletions examples/options/include/traccc/options/clusterization.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ class clusterization
unsigned int max_cells_per_thread;
unsigned int target_cells_per_thread;
unsigned int backup_size_multiplier;
bool enable_debug_output;
/// @}

/// Print the specific options of this class
Expand Down
16 changes: 15 additions & 1 deletion examples/options/src/clusterization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@
// System include(s).
#include <iostream>

namespace {
#ifndef NDEBUG
constexpr bool enable_cca_debug_default = true;
#else
constexpr bool enable_cca_debug_default = false;
#endif
} // namespace

namespace traccc::opts {

clusterization::clusterization() : interface("Clusterization Options") {
Expand All @@ -33,6 +41,10 @@ clusterization::clusterization() : interface("Clusterization Options") {
boost::program_options::value(&backup_size_multiplier)
->default_value(256),
"The size multiplier of the backup scratch space");
m_desc.add_options()("cca-debug",
boost::program_options::value(&enable_debug_output)
->default_value(enable_cca_debug_default),
"The size multiplier of the backup scratch space");
}

clusterization::operator clustering_config() const {
Expand All @@ -42,6 +54,7 @@ clusterization::operator clustering_config() const {
rv.max_cells_per_thread = max_cells_per_thread;
rv.target_cells_per_thread = target_cells_per_thread;
rv.backup_size_multiplier = backup_size_multiplier;
rv.enable_debug_output = enable_debug_output;

return rv;
}
Expand All @@ -54,7 +67,8 @@ std::ostream& clusterization::print_impl(std::ostream& out) const {
out << " Threads per partition: " << threads_per_partition << "\n";
out << " Target cells per thread: " << target_cells_per_thread << "\n";
out << " Max cells per thread: " << max_cells_per_thread << "\n";
out << " Scratch space size mult.: " << backup_size_multiplier;
out << " Scratch space size mult.: " << backup_size_multiplier << "\n";
out << " Debug output printing: " << enable_debug_output << "\n";
return out;
}

Expand Down
2 changes: 2 additions & 0 deletions tests/common/tests/cca_test.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ inline traccc::clustering_config default_ccl_test_config() {
rv.max_cells_per_thread = 16;
rv.target_cells_per_thread = 8;
rv.backup_size_multiplier = 256;
rv.enable_debug_output = false;

return rv;
}
Expand All @@ -54,6 +55,7 @@ inline traccc::clustering_config tiny_ccl_test_config() {
rv.max_cells_per_thread = 1;
rv.target_cells_per_thread = 1;
rv.backup_size_multiplier = 16384;
rv.enable_debug_output = false;

return rv;
}
Expand Down

0 comments on commit 23b3266

Please sign in to comment.