Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add bytes_per_second to groupby max benchmark. #13984

Draft
wants to merge 1 commit into
base: branch-23.10
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ target_include_directories(

# Use an OBJECT library so we compile these helper source files only once
add_library(
cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp"
synchronization/synchronization.cpp io/cuio_common.cpp
cudf_benchmark_common OBJECT
"${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" synchronization/synchronization.cpp
io/cuio_common.cpp common/memory_statistics.cpp
)
target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
add_custom_command(
Expand Down
134 changes: 134 additions & 0 deletions cpp/benchmarks/common/memory_statistics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "memory_statistics.hpp"

#include <cudf/column/column.hpp>
#include <cudf/dictionary/dictionary_column_view.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <numeric>

namespace {

/**
 * @brief Calculate the payload size of a string column.
 *
 * Counts the character data, the offsets child column and, if present, the
 * validity bitmask — all of which must be read or written when the column is
 * processed, so all contribute to the bytes/second metric.
 */
inline uint64_t required_bytes_string(const cudf::column_view& column)
{
  CUDF_EXPECTS(column.type().id() == cudf::type_id::STRING, "Input not a STRING column");
  cudf::strings_column_view input(column);

  uint64_t num_bytes = input.chars_size();
  // Offsets: one size_type entry per row plus the trailing end offset.
  num_bytes += (static_cast<uint64_t>(column.size()) + 1) * sizeof(cudf::size_type);
  if (column.nullable()) { num_bytes += cudf::bitmask_allocation_size_bytes(column.size()); }
  return num_bytes;
}

/**
 * @brief Calculate the payload size of a struct column.
 *
 * Sums the payload of every child column and, if present, this column's
 * validity bitmask.
 */
inline uint64_t required_bytes_struct(const cudf::column_view& column)
{
  CUDF_EXPECTS(column.type().id() == cudf::type_id::STRUCT, "Input not a STRUCT column");

  // Use an explicit uint64_t init value: a plain `0` would make the
  // accumulator an int and silently truncate totals beyond INT_MAX bytes.
  uint64_t num_bytes = std::accumulate(
    column.child_begin(), column.child_end(), uint64_t{0}, [](uint64_t acc, const auto& col) {
      return acc + required_bytes(col);
    });
  if (column.nullable()) { num_bytes += cudf::bitmask_allocation_size_bytes(column.size()); }
  return num_bytes;
}

/**
 * @brief Calculate the payload size of a list column.
 *
 * Sums the payload of the child columns (offsets and elements) and, if
 * present, this column's validity bitmask.
 */
inline uint64_t required_bytes_list(const cudf::column_view& column)
{
  CUDF_EXPECTS(column.type().id() == cudf::type_id::LIST, "Input not a LIST column");

  // Use an explicit uint64_t init value: a plain `0` would make the
  // accumulator an int and silently truncate totals beyond INT_MAX bytes.
  uint64_t num_bytes = std::accumulate(
    column.child_begin(), column.child_end(), uint64_t{0}, [](uint64_t acc, const auto& col) {
      return acc + required_bytes(col);
    });
  if (column.nullable()) { num_bytes += cudf::bitmask_allocation_size_bytes(column.size()); }
  return num_bytes;
}

/**
 * @brief Calculate the payload size of a dictionary column.
 *
 * Counts the keys child, the indices child and, if present, this column's
 * validity bitmask.
 */
inline uint64_t required_bytes_dict(const cudf::column_view& column)
{
  CUDF_EXPECTS(column.type().id() == cudf::type_id::DICTIONARY32,
               "Input not a DICTIONARY32 column");

  cudf::dictionary_column_view const dict_view(column);
  uint64_t total = required_bytes(dict_view.keys()) + required_bytes(dict_view.indices());
  if (column.nullable()) { total += cudf::bitmask_allocation_size_bytes(column.size()); }
  return total;
}

/**
 * @brief Calculate the payload size of a column of fixed-width elements.
 *
 * Counts `size * sizeof(element)` plus, if present, the validity bitmask.
 */
inline uint64_t required_bytes_fixed_width_type(const cudf::column_view& column)
{
  CUDF_EXPECTS(cudf::is_fixed_width(column.type()), "Invalid element type");

  auto const row_count = static_cast<uint64_t>(column.size());
  uint64_t total       = row_count * cudf::size_of(column.type());
  if (column.nullable()) { total += cudf::bitmask_allocation_size_bytes(column.size()); }
  return total;
}

} // namespace

uint64_t required_bytes(const cudf::column_view& column)
{
  // Dispatch on the column's type id; every non-nested, non-string type is
  // handled by the fixed-width path.
  switch (column.type().id()) {
    case cudf::type_id::STRING: return required_bytes_string(column);
    case cudf::type_id::STRUCT: return required_bytes_struct(column);
    case cudf::type_id::LIST: return required_bytes_list(column);
    case cudf::type_id::DICTIONARY32: return required_bytes_dict(column);
    default: return required_bytes_fixed_width_type(column);
  }
}

uint64_t required_bytes(const cudf::table_view& table)
{
  // Sum the payload of every column. Use an explicit uint64_t init value: a
  // plain `0` would make the accumulator an int and silently truncate totals
  // beyond INT_MAX bytes.
  return std::accumulate(
    table.begin(), table.end(), uint64_t{0}, [](uint64_t acc, const auto& col) {
      return acc + required_bytes(col);
    });
}

uint64_t required_bytes(
  const cudf::host_span<cudf::groupby::aggregation_result>& aggregation_results)
{
  // Each aggregation result holds a vector of result columns; sum the
  // payload bytes over all columns of all results.
  uint64_t total = 0;
  for (auto const& result : aggregation_results) {
    for (auto const& column : result.results) {
      total += required_bytes(column->view());
    }
  }
  return total;
}
57 changes: 57 additions & 0 deletions cpp/benchmarks/common/memory_statistics.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>

/**
* @brief Calculate the number of bytes needed to completely read/write the provided column.
*
* The function computes only the size of the column's payload in bytes; it excludes
* any metadata.
*
* @param column View of the input column
* @returns Number of bytes needed to read or write the column.
*/
uint64_t required_bytes(const cudf::column_view& column);

/**
* @brief Calculate the number of bytes needed to completely read/write the provided table.
*
* The function computes only the size of the table's payload in bytes; it excludes
* any metadata.
*
* @param table View of the input table.
* @returns Number of bytes needed to read or write the table.
*/
uint64_t required_bytes(const cudf::table_view& table);

/**
* @brief Calculate the number of bytes needed to completely read/write the provided sequence of
* aggregation results.
*
* The function computes only the size of the payload of the aggregation results in bytes;
* it excludes any metadata.
*
* @param aggregation_results Sequence of aggregation results from groupby execution.
* @returns Number of bytes needed to read or write the aggregation results.
*/
uint64_t required_bytes(
const cudf::host_span<cudf::groupby::aggregation_result>& aggregation_results);
27 changes: 21 additions & 6 deletions cpp/benchmarks/groupby/group_max.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>

#include <cudf/groupby.hpp>

#include <nvbench/nvbench.cuh>

#include <optional>

template <typename Type>
void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
{
Expand All @@ -32,24 +35,36 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list<Type>)
}();

auto const vals = [&] {
auto builder = data_profile_builder().cardinality(0).distribution(
data_profile profile = data_profile_builder().cardinality(0).distribution(
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 1000);
if (const auto null_freq = state.get_float64("null_probability"); null_freq > 0) {
builder.null_probability(null_freq);
profile.set_null_probability(null_freq);
} else {
builder.no_validity();
profile.set_null_probability(std::nullopt);
}
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, data_profile{builder});
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
}();

auto keys_view = keys->view();
auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));
auto const keys_view = keys->view();
auto const keys_table = cudf::table_view({keys_view, keys_view, keys_view});
auto gb_obj = cudf::groupby::groupby(keys_table);

std::vector<cudf::groupby::aggregation_request> requests;
requests.emplace_back(cudf::groupby::aggregation_request());
requests[0].values = vals->view();
requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table));

// The number of written bytes depends on random distribution of keys.
// For larger sizes it converges against the number of unique elements
// in the input distribution (101 elements)
auto [res_table, res_agg] = gb_obj.aggregate(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
Expand Down
18 changes: 15 additions & 3 deletions cpp/benchmarks/groupby/group_nunique.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>

#include <cudf/groupby.hpp>

Expand Down Expand Up @@ -58,11 +59,22 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list<Type>)
return create_random_column(cudf::type_to_id<Type>(), row_count{size}, profile);
}();

auto gb_obj =
cudf::groupby::groupby(cudf::table_view({keys->view(), keys->view(), keys->view()}));
auto const requests = make_aggregation_request_vector(
auto const keys_table = cudf::table_view({keys->view(), keys->view(), keys->view()});
auto gb_obj = cudf::groupby::groupby(keys_table);
auto const requests = make_aggregation_request_vector(
*vals, cudf::make_nunique_aggregation<cudf::groupby_aggregation>());

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table));

// The number of written bytes depends on random distribution of keys.
// For larger sizes it converges against the number of unique elements
// in the input distribution (101 elements)
auto [res_table, res_agg] = gb_obj.aggregate(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
Expand Down
12 changes: 12 additions & 0 deletions cpp/benchmarks/groupby/group_rank.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/groupby.hpp>
Expand Down Expand Up @@ -53,6 +54,17 @@ static void nvbench_groupby_rank(nvbench::state& state,
cudf::groupby::groupby gb_obj(
keys, cudf::null_policy::EXCLUDE, is_sorted ? cudf::sorted::YES : cudf::sorted::NO);

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(order_by));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys));

// The number of written bytes depends on random distribution of keys.
// For larger sizes it converges against the number of unique elements
// in the input distribution (101 elements)
auto [res_table, res_agg] = gb_obj.scan(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
rmm::cuda_stream_view stream_view{launch.get_stream()};
// groupby scan uses sort implementation
Expand Down
12 changes: 12 additions & 0 deletions cpp/benchmarks/groupby/group_struct_keys.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/common/memory_statistics.hpp>

#include <cudf_test/column_wrapper.hpp>

Expand Down Expand Up @@ -83,6 +84,17 @@ void bench_groupby_struct_keys(nvbench::state& state)
auto stream = cudf::get_default_stream();
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));

// Add memory statistics
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(keys_table.view()));
state.add_global_memory_reads<nvbench::uint8_t>(required_bytes(vals->view()));

// The number of written bytes depends on random distribution of keys.
// For larger sizes it converges against the number of unique elements
// in the input distribution (101 elements)
auto [res_table, res_agg] = gb_obj.aggregate(requests);
state.add_global_memory_writes<uint8_t>(required_bytes(res_table->view()));
state.add_global_memory_writes<uint8_t>(required_bytes(res_agg));

state.exec(nvbench::exec_tag::sync,
[&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
}
Expand Down