Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add throughput metrics for REDUCTION_BENCH/REDUCTION_NVBENCH benchmarks #16126

Merged
merged 7 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,13 @@ target_include_directories(

# Use an OBJECT library so we only compile these helper source files only once
add_library(
cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
synchronization/synchronization.cpp io/cuio_common.cpp
cudf_benchmark_common OBJECT
"${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
synchronization/synchronization.cpp
io/cuio_common.cpp
common/table_utilities.cpp
common/benchmark_utilities.cpp
common/nvbench_utilities.cpp
)
target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
add_custom_command(
Expand Down
27 changes: 27 additions & 0 deletions cpp/benchmarks/common/benchmark_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "benchmark_utilities.hpp"

// Reports the total number of items processed by the benchmark: the per-iteration
// count scaled by the iteration count taken from `state.iterations()`.
void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration)
{
  auto const total_items = state.iterations() * items_processed_per_iteration;
  state.SetItemsProcessed(total_items);
}

// Reports the total number of bytes processed by the benchmark: the per-iteration
// byte count scaled by the iteration count taken from `state.iterations()`.
void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration)
{
  auto const total_bytes = state.iterations() * bytes_processed_per_iteration;
  state.SetBytesProcessed(total_bytes);
}
41 changes: 41 additions & 0 deletions cpp/benchmarks/common/benchmark_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <benchmark/benchmark.h>

/**
 * @brief Sets the number of items processed during the benchmark.
 *
 * This function can be used instead of calling ::benchmark::State.SetItemsProcessed()
 * directly, to avoid repeatedly spelling out
 * ::benchmark::State.iterations() * items_processed_per_iteration at every call site.
 *
 * @param state the benchmark state
 * @param items_processed_per_iteration number of items processed per iteration
 */
void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration);

/**
 * @brief Sets the number of bytes processed during the benchmark.
 *
 * This function can be used instead of calling ::benchmark::State.SetBytesProcessed()
 * directly, to avoid repeatedly spelling out
 * ::benchmark::State.iterations() * bytes_processed_per_iteration at every call site.
 *
 * @param state the benchmark state
 * @param bytes_processed_per_iteration number of bytes processed per iteration
 */
void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration);
60 changes: 60 additions & 0 deletions cpp/benchmarks/common/nvbench_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nvbench_utilities.hpp"

#include <nvbench/nvbench.cuh>

// This function is copied over from
// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224.
//
// It reads the "nv/cold/time/gpu/mean" summary from `state` and, from the element count
// and global-memory byte count recorded on `state`, adds the item-rate, byte-rate and
// bandwidth-utilization summaries. A summary is only added when its count is non-zero.
void set_throughputs(nvbench::state& state)
{
  double const mean_gpu_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");

  // Appends one summary to `state` and fills in its fields in a fixed order.
  auto const publish = [&state](char const* tag,
                                char const* name,
                                char const* hint,
                                char const* description,
                                double value) {
    auto& entry = state.add_summary(tag);
    entry.set_string("name", name);
    entry.set_string("hint", hint);
    entry.set_string("description", description);
    entry.set_float64("value", value);
  };

  auto const element_count = state.get_element_count();
  if (element_count != 0) {
    publish("nv/cold/bw/item_rate",
            "Elem/s",
            "item_rate",
            "Number of input elements processed per second",
            static_cast<double>(element_count) / mean_gpu_time);
  }

  auto const rw_bytes = state.get_global_memory_rw_bytes();
  if (rw_bytes == 0) { return; }

  auto const achieved_bw = static_cast<double>(rw_bytes) / mean_gpu_time;
  publish("nv/cold/bw/global/bytes_per_second",
          "GlobalMem BW",
          "byte_rate",
          "Number of bytes read/written per second to the CUDA "
          "device's global memory",
          achieved_bw);

  // The utilization value is the ratio of achieved to peak bandwidth.
  auto const peak_bw =
    static_cast<double>(state.get_device()->get_global_memory_bus_bandwidth());
  publish("nv/cold/bw/global/utilization",
          "BWUtil",
          "percentage",
          "Global device memory utilization as a percentage of the "
          "device's peak bandwidth",
          achieved_bw / peak_bw);
}
31 changes: 31 additions & 0 deletions cpp/benchmarks/common/nvbench_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

// Forward declaration of nvbench::state, which is only used by reference below.
namespace nvbench {
struct state;
}

/**
 * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the
 * nvbench results summary.
 *
 * This function can be used to work around a known issue: the throughput statistics
 * have to be added before the nvbench::state.exec() call, otherwise they will not be printed
 * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details.
 *
 * @param state the nvbench state to which the throughput summaries are added
 */
void set_throughputs(nvbench::state& state);
41 changes: 41 additions & 0 deletions cpp/benchmarks/common/table_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "table_utilities.hpp"

#include <cudf/reduction.hpp>
#include <cudf/transform.hpp>

#include <cmath>

// Estimates the size of a single column in bytes by wrapping it in a one-column
// table view and delegating to the table overload of estimate_size().
int64_t estimate_size(cudf::column_view const& col)
{
  cudf::table_view const single_column_table({col});
  return estimate_size(single_column_table);
}

// Estimates the size of a table in bytes.
//
// The per-row sizes in bits reported by cudf::row_bit_count() are summed with an INT64
// sum reduction, and the total is rounded up to whole bytes.
//
// @param view The table view to estimate its size
// @return The estimated size in bytes (0 for a table without rows)
int64_t estimate_size(cudf::table_view const& view)
{
  // A table without rows occupies no row data. Returning early also avoids reading the
  // value of the scalar produced by reducing an empty column.
  // NOTE(review): cudf::reduce may return a null scalar for empty input -- confirm.
  if (view.num_rows() == 0) { return 0; }

  // Compute the size in bits for each row.
  auto const row_sizes = cudf::row_bit_count(view);
  // Accumulate the row sizes to compute a sum.
  auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
  cudf::data_type const sum_dtype{cudf::type_id::INT64};
  auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype);
  auto const total_size_in_bits =
    static_cast<cudf::numeric_scalar<int64_t>*>(total_size_scalar.get())->value();

  // Convert the size in bits to the size in bytes, rounding up. Integer arithmetic keeps
  // the result exact; a round trip through double loses precision for totals above 2^53.
  constexpr int64_t bits_per_byte = 8;
  auto const whole_bytes          = total_size_in_bits / bits_per_byte;
  auto const has_partial_byte     = (total_size_in_bits % bits_per_byte) != 0;
  return whole_bytes + (has_partial_byte ? 1 : 0);
}
41 changes: 41 additions & 0 deletions cpp/benchmarks/common/table_utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/table/table_view.hpp>

/**
 * @brief Estimates the column size in bytes.
 *
 * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
 * and accumulates them, the returned estimate may be an inexact approximation in some
 * cases. See cudf::row_bit_count() for more details.
 *
 * @param view The column view whose size is estimated
 * @return The estimated size of the column in bytes
 */
int64_t estimate_size(cudf::column_view const& view);

/**
 * @brief Estimates the table size in bytes.
 *
 * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
 * and accumulates them, the returned estimate may be an inexact approximation in some
 * cases. See cudf::row_bit_count() for more details.
 *
 * @param view The table view whose size is estimated
 * @return The estimated size of the table in bytes
 */
int64_t estimate_size(cudf::table_view const& view);
8 changes: 7 additions & 1 deletion cpp/benchmarks/reduction/anyall.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,9 @@
* limitations under the License.
*/

#include <benchmarks/common/benchmark_utilities.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/table_utilities.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

Expand Down Expand Up @@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state,
cuda_event_timer timer(state, true);
auto result = cudf::reduce(*values, *agg, output_dtype);
}

// The benchmark takes a column and produces one scalar.
set_items_processed(state, column_size + 1);
set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype));
}

#define concat(a, b, c) a##b##c
Expand Down
10 changes: 9 additions & 1 deletion cpp/benchmarks/reduction/dictionary.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include <benchmarks/common/benchmark_utilities.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>
Expand Down Expand Up @@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state,
cuda_event_timer timer(state, true);
auto result = cudf::reduce(*values, *agg, output_dtype);
}

// The benchmark takes a column and produces one scalar.
set_items_processed(state, column_size + 1);

// We don't set the metrics for the size read/written because row_bit_count() doesn't
// support the dictionary type yet (and therefore neither does estimate_size()).
// See https://github.com/rapidsai/cudf/issues/16121 for details.
}

#define concat(a, b, c) a##b##c
Expand Down
13 changes: 10 additions & 3 deletions cpp/benchmarks/reduction/minmax.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,9 @@
* limitations under the License.
*/

#include <benchmarks/common/benchmark_utilities.hpp>
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/table_utilities.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

Expand All @@ -28,14 +30,19 @@ template <typename type>
void BM_reduction(benchmark::State& state)
{
cudf::size_type const column_size{(cudf::size_type)state.range(0)};
auto const dtype = cudf::type_to_id<type>();
auto const dtype_id = cudf::type_to_id<type>();
auto const input_column =
create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity());
create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity());

for (auto _ : state) {
cuda_event_timer timer(state, true);
auto result = cudf::minmax(*input_column);
}

// The benchmark takes a column and produces two scalars.
set_items_processed(state, column_size + 2);
cudf::data_type dtype = cudf::data_type{dtype_id};
set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype));
}

#define concat(a, b, c) a##b##c
Expand Down
13 changes: 11 additions & 2 deletions cpp/benchmarks/reduction/rank.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,6 +15,8 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/nvbench_utilities.hpp>
#include <benchmarks/common/table_utilities.hpp>

#include <cudf/detail/scan.hpp>
#include <cudf/filling.hpp>
Expand All @@ -39,11 +41,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_list<typ
auto const new_tbl = cudf::repeat(table->view(), 2);
cudf::column_view input(new_tbl->view().column(0));

std::unique_ptr<cudf::column> result = nullptr;
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
auto result = cudf::detail::inclusive_dense_rank_scan(
result = cudf::detail::inclusive_dense_rank_scan(
input, stream_view, rmm::mr::get_current_device_resource());
});

state.add_element_count(input.size());
state.add_global_memory_reads(estimate_size(input));
state.add_global_memory_writes(estimate_size(result->view()));

set_throughputs(state);
}

using data_type = nvbench::type_list<int32_t, cudf::list_view>;
Expand Down
Loading
Loading