GH-34785: [C++][Parquet] Parquet Bloom Filter Writer Implementation #37400

Status: Open. Wants to merge 62 commits into base: main.
Changes shown below are from 4 commits.

Commits (62):
f1c6dc0  Parquet: Implement skeleton for BloomFilter (mapleFU, Aug 26, 2023)
6ebd6da  tiny fixing (mapleFU, Aug 26, 2023)
70c9267  tiny update test (mapleFU, Aug 26, 2023)
48350d8  trying to fix ci (mapleFU, Aug 26, 2023)
d2a659e  fix lint (mapleFU, Aug 26, 2023)
41236d8  fix some style problem (mapleFU, Aug 26, 2023)
8afba81  add file roundtrip test (mapleFU, Aug 26, 2023)
96c6691  add file roundtrip test (mapleFU, Aug 26, 2023)
c131341  fix document and ci (mapleFU, Aug 26, 2023)
220b58e  Update: tiny style fix (mapleFU, Aug 26, 2023)
ad96c48  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Sep 2, 2023)
b756241  Bloom Filter Resolve comments: (mapleFU, Sep 2, 2023)
f43505b  make space writing a batched writing (mapleFU, Sep 2, 2023)
3497f4a  update bloom_filter builder interface (mapleFU, Sep 2, 2023)
fecd0f0  update BloomFilterBuilder arguments (mapleFU, Sep 2, 2023)
29cc1c1  fix compile (mapleFU, Sep 2, 2023)
ffbb491  try to satisfy win compiler (mapleFU, Sep 2, 2023)
4d63428  change all to vector (mapleFU, Sep 2, 2023)
f689716  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Sep 11, 2023)
8e9cb16  resolve comment (mapleFU, Sep 11, 2023)
7fd47be  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Oct 2, 2023)
7c4ff4e  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Oct 10, 2023)
feccee9  fix some comment (mapleFU, Oct 10, 2023)
90245e7  add cached version test (mapleFU, Oct 10, 2023)
d924e36  cleaning the code for column-props (mapleFU, Oct 10, 2023)
0340193  optimize get bf (mapleFU, Oct 10, 2023)
b78eed0  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Mar 16, 2024)
23828e1  comment minor fix (mapleFU, Mar 16, 2024)
6fd57dc  fix comment and add bloom-filter-length (mapleFU, Mar 16, 2024)
86a8760  Fix a bf bug (mapleFU, Mar 16, 2024)
f8e724c  trying to use std::map for RowGroup filter (mapleFU, Mar 17, 2024)
447badf  trying to fix msvc compile (mapleFU, Mar 17, 2024)
0c1065c  fix comment (mapleFU, Mar 17, 2024)
5225e08  add test case for 2 row-groups (mapleFU, Mar 17, 2024)
a779982  add test case for dictionary (mapleFU, Mar 17, 2024)
4195406  minor update style for file_writer.cc (mapleFU, Mar 17, 2024)
ed267bd  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Mar 26, 2024)
478889d  resolve comment (mapleFU, Mar 26, 2024)
2992072  fix comment for boolean col, and add test (mapleFU, Mar 26, 2024)
4852261  trying to add bloom boolean test (mapleFU, Mar 26, 2024)
add1afd  fix test (mapleFU, Mar 26, 2024)
f627e30  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Apr 8, 2024)
bb8d4a5  fix some comments (mapleFU, Apr 8, 2024)
ad0f1af  Merge branch 'parquet/support-write-bloom-filter' of github.com:maple… (mapleFU, Apr 8, 2024)
e1de5bc  fix lint (mapleFU, Apr 8, 2024)
430742a  switch to anonymous namespace (mapleFU, Apr 9, 2024)
00f176e  fix comment for column_writer.cc (mapleFU, Apr 26, 2024)
17f4951  fix comment in other parts (mapleFU, Apr 26, 2024)
de27ce4  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Apr 26, 2024)
259f15b  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Apr 26, 2024)
057b542  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Jun 10, 2024)
34a4c28  trying to fix the ci build (mapleFU, Jun 10, 2024)
70e3508  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Jul 3, 2024)
c587568  resolve comments (mapleFU, Jul 3, 2024)
2223423  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Nov 11, 2024)
22030db  change the bloom filter from vector to map (mapleFU, Nov 11, 2024)
e9c550a  fix lint (mapleFU, Nov 11, 2024)
23fb3fa  fix lint (mapleFU, Nov 14, 2024)
d892819  fix comment (mapleFU, Nov 15, 2024)
ef3291d  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Dec 20, 2024)
7aee7dd  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Jan 13, 2025)
c5b1fb1  Resolve comments (mapleFU, Jan 13, 2025)

3 changes: 2 additions & 1 deletion cpp/src/parquet/CMakeLists.txt
@@ -156,6 +156,7 @@ set(PARQUET_SRCS
arrow/writer.cc
bloom_filter.cc
bloom_filter_reader.cc
bloom_filter_builder.cc
column_reader.cc
column_scanner.cc
column_writer.cc
@@ -335,7 +336,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h"
add_parquet_test(internals-test
SOURCES
bloom_filter_test.cc
bloom_filter_reader_test.cc
bloom_filter_parquet_test.cc
properties_test.cc
statistics_test.cc
encoding_test.cc
6 changes: 6 additions & 0 deletions cpp/src/parquet/bloom_filter.h
@@ -167,6 +167,12 @@ class PARQUET_EXPORT BloomFilter {

virtual ~BloomFilter() = default;

// Variants taking a const pointer argument, to facilitate templated callers.
uint64_t Hash(const int32_t* value) const { return Hash(*value); }
uint64_t Hash(const int64_t* value) const { return Hash(*value); }
uint64_t Hash(const float* value) const { return Hash(*value); }
uint64_t Hash(const double* value) const { return Hash(*value); }

protected:
// Hash strategy available for Bloom filter.
enum class HashStrategy : uint32_t { XXHASH = 0 };
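
As an aside, here is a minimal sketch (a hypothetical helper, not part of this PR) of how the pointer-taking Hash() overloads above let templated code hash fixed-width values the same way it already hashes ByteArray through a const pointer:

#include "parquet/bloom_filter.h"
#include "parquet/types.h"

// Illustration only: with the pointer overloads, the call shape is
// Hash(const T*) for int32/int64/float/double as well as Int96/ByteArray,
// so one template can cover them all. FLBA still needs its type length and
// is not handled by this sketch.
template <typename DType>
void InsertValues(parquet::BloomFilter* filter,
                  const typename DType::c_type* values, int64_t num_values) {
  for (int64_t i = 0; i < num_values; ++i) {
    filter->InsertHash(filter->Hash(&values[i]));
  }
}
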
117 changes: 117 additions & 0 deletions cpp/src/parquet/bloom_filter_builder.cc
@@ -0,0 +1,117 @@
#include "parquet/bloom_filter_builder.h"

#include <utility>

#include "arrow/io/interfaces.h"

#include "metadata.h"
#include "parquet/bloom_filter.h"
#include "parquet/exception.h"
#include "parquet/properties.h"

namespace parquet {

class BloomFilterBuilderImpl : public BloomFilterBuilder {
public:
explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema,
WriterProperties properties)
Member: What's the point of making a copy here?

Member Author: Anyway, the underlying builder doesn't hold a reference here?

Member: The builder cannot outlive the FileWriter, so why not simply follow other places like this: https://github.com/search?q=repo%3Aapache%2Farrow+%22const+WriterProperties*%22&type=code

Member Author: done

: schema_(schema), properties_(std::move(properties)) {}
/// Append a new row group to host all incoming bloom filters.
void AppendRowGroup() override;
Contributor: This isn't actually appending a new row group, just marking that a row group is starting, so filters should be reset?

Member Author: Yes. Parquet uses row-group-level bloom filters, so this just sets up a new row group for the filters.

BloomFilter* GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) override;

/// Serialize all bloom filters with header and bitset in the order of row group and
/// column id. Column encryption is not implemented yet. The side effect is that it
Member (quoting "Column encryption is not implemented yet"): What about putting this at the beginning of class BloomFilterBuilder?

/// deletes all bloom filters after they have been flushed.
void WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) override;

void Finish() override { finished_ = true; }

private:
/// Make sure column ordinal is not out of bound and the builder is in good state.
void CheckState(int32_t column_ordinal) const {
if (finished_) {
throw ParquetException("PageIndexBuilder is already finished.");
}
if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) {
throw ParquetException("Invalid column ordinal: ", column_ordinal);
}
if (row_group_bloom_filters_.empty()) {
throw ParquetException("No row group appended to PageIndexBuilder.");
}
}

const SchemaDescriptor* schema_;
WriterProperties properties_;
bool finished_ = false;

// vector: row_group_ordinal
// map: column_ordinal -> bloom filter
std::vector<std::map<int32_t, std::unique_ptr<BloomFilter>>> row_group_bloom_filters_;
};

std::unique_ptr<BloomFilterBuilder> BloomFilterBuilder::Make(
const SchemaDescriptor* schema, const WriterProperties& properties) {
return std::unique_ptr<BloomFilterBuilder>(
new BloomFilterBuilderImpl(schema, properties));
}

void BloomFilterBuilderImpl::AppendRowGroup() { row_group_bloom_filters_.emplace_back(); }

BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) {
CheckState(column_ordinal);
std::unique_ptr<BloomFilter>& bloom_filter =
row_group_bloom_filters_.back()[column_ordinal];
if (bloom_filter == nullptr) {
auto block_split_bloom_filter =
std::make_unique<BlockSplitBloomFilter>(properties_.memory_pool());
block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes(
Contributor: There was a recent discussion on the Parquet mailing list about bloom filters and what good writers should do. My take-aways were:

  1. Knowing NDV up-front typically requires two passes, which I don't think we are doing. It might be better to take FPP and a byte size, and work out NDV if necessary.
  2. An extension of this idea (I think someone tried it in Java, maybe it was you?) is to have multiple byte sizes (e.g. at log_2 intervals with an FPP), write to all of them, and then choose the smallest one that is reasonably sparse.
  3. In either case it would likely be a good idea to evaluate the final bloom filters for sparsity before choosing to write them (this might be another config parameter).

Member Author (quoting "I think someone tried in Java, maybe it was you?"): Personally I think the best way is to buffer the hash values and make a decision later, when there are too many hash values or the buffer is too large. But I think we can first make a "static" config and enhance it later.

Member: I have reviewed that PR and it could be a followup change. The writer implementation has the freedom to try smart things.

FYI, parquet-java also discards the bloom filter if dictionary encoding is applied to all data pages, though I don't think we should do the same thing.

bloom_filter_options.ndv, bloom_filter_options.fpp));
bloom_filter = std::move(block_split_bloom_filter);
}
return bloom_filter.get();
}
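
For context on the ndv/fpp trade-off discussed in the thread above, here is a standalone sketch of the classic bloom filter sizing rule. It is only an approximation of what a writer might compute; the actual byte count returned by BlockSplitBloomFilter::OptimalNumOfBytes may differ (for example due to block granularity or bounds clamping):

#include <cmath>
#include <cstdint>

// Classic rule of thumb: bits ~= -ndv * ln(fpp) / (ln 2)^2.
// Illustration only; not the implementation behind OptimalNumOfBytes.
uint32_t ApproxBloomFilterBytes(uint32_t ndv, double fpp) {
  const double ln2 = std::log(2.0);
  const double bits = -static_cast<double>(ndv) * std::log(fpp) / (ln2 * ln2);
  return static_cast<uint32_t>(std::ceil(bits / 8.0));
}

For example, ndv = 100 at fpp = 0.01 works out to roughly 120 bytes before any rounding.
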

void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) {
if (!finished_) {
throw ParquetException("Cannot call WriteTo() to unfinished PageIndexBuilder.");
}
if (row_group_bloom_filters_.empty()) {
// Return quickly if there is no bloom filter
return;
}

for (size_t row_group_ordinal = 0; row_group_ordinal < row_group_bloom_filters_.size();
++row_group_ordinal) {
const auto& row_group_bloom_filters = row_group_bloom_filters_[row_group_ordinal];
// the whole row group has no bloom filter
if (row_group_bloom_filters.empty()) {
continue;
}
bool has_valid_bloom_filter = false;
int num_columns = schema_->num_columns();
std::vector<std::optional<IndexLocation>> locations(num_columns, std::nullopt);
Contributor: Is it maybe better to make this a map? I expect the number of columns with a bloom filter to be relatively small compared to the overall number of columns.

Member Author: 🤔 This reuses some structure from the PageIndex; however, IndexLocation is just 9 bytes, and even for a Parquet file with 10000 columns the cost here is low (about 200 KiB), so I think we can keep the vector here?


// Serialize bloom filters in ascending order of column id
for (int32_t column_id = 0; column_id < num_columns; ++column_id) {
auto iter = row_group_bloom_filters.find(column_id);
if (iter != row_group_bloom_filters.cend() && iter->second != nullptr) {
PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell());
iter->second->WriteTo(sink);
PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell());
has_valid_bloom_filter = true;
locations[column_id] = IndexLocation{offset, static_cast<int32_t>(pos - offset)};
}
}
if (has_valid_bloom_filter) {
location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations));
}
}
}

} // namespace parquet
69 changes: 69 additions & 0 deletions cpp/src/parquet/bloom_filter_builder.h
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module defines the BloomFilterBuilder interface for collecting the
// bloom filters of a Parquet file while it is being written, and for
// serializing them to the output stream.

#pragma once

#include "arrow/io/interfaces.h"
#include "parquet/types.h"

namespace parquet {

class BloomFilter;
Member: Move these forward declarations to parquet/type_fwd.h?

class SchemaDescriptor;
struct BloomFilterOptions;
struct BloomFilterLocation;

namespace schema {
class ColumnPath;
}

/// \brief Interface for collecting bloom filter of a parquet file.
class PARQUET_EXPORT BloomFilterBuilder {
Member: Should this be BloomFilterWriter instead?

Member Author: IMO, BloomFilterBuilder is better, because the class BloomFilter is more like a BloomFilterWriter.

Member Author: The other name also LGTM; BloomFilterBuilder reads like a "container for the bloom filter writer of each column".

Member: We already have a lot of Builders, so BloomFilterBuilder looks fine to me.

Member: BTW, do we really need a separate file for the new builder? If yes, should we make it an internal header? My rationale is that this kind of builder is used only internally by the writer, and users do not have to deal with it at all. However, we have already exposed builders like FileMetaDataBuilder, RowGroupMetaDataBuilder, ColumnChunkMetaDataBuilder, and PageIndexBuilder (unfortunately the last was added by myself).

Member Author: You're right, this could be an internal class.

Member Author: I use an anonymous namespace here.

Member: Where is the anonymous namespace?

Member Author: Currently I changed it to internal, since an anonymous namespace for a structure used by Parquet is a bit hacky 🤔

public:
/// \brief API convenience to create a BloomFilterBuilder.
Contributor: It seems that this is a static factory? I don't think users can create one without calling this method.

Member Author: Yes, it's a static factory.

Member Author: Changing it to /// \brief API to create a BloomFilterBuilder.

static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor* schema,
const WriterProperties& properties);

/// Append a new row group to host all incoming bloom filters.
virtual void AppendRowGroup() = 0;

/// \brief Get the BloomFilter from column ordinal.
///
/// \param column_ordinal Column ordinal in schema, which is only for leaf columns.
/// \return BloomFilter for the column; its memory ownership belongs to
/// the BloomFilterBuilder.
virtual BloomFilter* GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0;

/// \brief Write the bloom filter to sink.
///
/// \param[out] sink The output stream to write the bloom filters to.
/// \param[out] location The location of all bloom filters relative to the start of the sink.
virtual void WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) = 0;

/// \brief Complete the bloom filter builder; no more writes are allowed afterwards.
virtual void Finish() = 0;

virtual ~BloomFilterBuilder() = default;
};

} // namespace parquet
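
To make the intended call sequence concrete, here is a rough usage sketch of the interface above, mirroring the round-trip test below. The function name, the chosen options, and the exact includes (e.g. parquet/metadata.h for BloomFilterLocation) are assumptions for illustration, not taken from this PR:

#include "parquet/bloom_filter.h"
#include "parquet/bloom_filter_builder.h"
#include "parquet/metadata.h"
#include "parquet/properties.h"
#include "parquet/schema.h"

// Sketch of the expected lifecycle from a writer's point of view.
void BuildAndFlushBloomFilters(const parquet::SchemaDescriptor* schema,
                               const parquet::WriterProperties& properties,
                               ::arrow::io::OutputStream* sink,
                               parquet::BloomFilterLocation* location) {
  auto builder = parquet::BloomFilterBuilder::Make(schema, properties);
  builder->AppendRowGroup();             // once per row group
  parquet::BloomFilterOptions options;   // ndv/fpp would come from column properties
  auto* filter = builder->GetOrCreateBloomFilter(/*column_ordinal=*/0, options);
  filter->InsertHash(filter->Hash(int64_t{42}));  // hash values as they are written
  builder->Finish();                     // no further inserts allowed
  builder->WriteTo(sink, location);      // serialize before the file footer
}
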
141 changes: 141 additions & 0 deletions cpp/src/parquet/bloom_filter_parquet_test.cc
@@ -0,0 +1,141 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <arrow/testing/gtest_util.h>
#include <gtest/gtest.h>

#include "parquet/bloom_filter.h"
#include "parquet/bloom_filter_builder.h"
#include "parquet/bloom_filter_reader.h"
#include "parquet/file_reader.h"
#include "parquet/test_util.h"

namespace parquet::test {

TEST(BloomFilterReader, ReadBloomFilter) {
std::string dir_string(parquet::test::get_data_dir());
std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet";
auto reader = ParquetFileReader::OpenFile(path, false);
auto file_metadata = reader->metadata();
EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
auto& bloom_filter_reader = reader->GetBloomFilterReader();
auto row_group_0 = bloom_filter_reader.RowGroup(0);
ASSERT_NE(nullptr, row_group_0);
EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
ASSERT_NE(nullptr, bloom_filter);
EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException);

// assert exists
{
std::string_view sv = "Hello";
ByteArray ba{sv};
EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba)));
}

// assert that a value not in the column is not found
{
std::string_view sv = "NOT_EXISTS";
ByteArray ba{sv};
EXPECT_FALSE(bloom_filter->FindHash(bloom_filter->Hash(&ba)));
}
}

TEST(BloomFilterReader, FileNotHaveBloomFilter) {
// Can still get a BloomFilterReader and a RowGroupBloomFilter
// reader, but cannot get a non-null BloomFilter.
std::string dir_string(parquet::test::get_data_dir());
std::string path = dir_string + "/alltypes_plain.parquet";
auto reader = ParquetFileReader::OpenFile(path, false);
auto file_metadata = reader->metadata();
EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
auto& bloom_filter_reader = reader->GetBloomFilterReader();
auto row_group_0 = bloom_filter_reader.RowGroup(0);
ASSERT_NE(nullptr, row_group_0);
EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
ASSERT_EQ(nullptr, bloom_filter);
}

// <c1:BYTE_ARRAY, c2:BYTE_ARRAY>, c1 has bloom filter.
TEST(BloomFilterBuilderTest, BasicRoundTrip) {
SchemaDescriptor schema;
schema::NodePtr root = schema::GroupNode::Make(
"schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::ByteArray("c2")});
schema.Init(root);
auto writer_properties = default_writer_properties();
auto builder = BloomFilterBuilder::Make(&schema, *writer_properties);
builder->AppendRowGroup();
BloomFilterOptions bloom_filter_options;
bloom_filter_options.ndv = 100;
auto bloom_filter = builder->GetOrCreateBloomFilter(0, bloom_filter_options);
ASSERT_NE(nullptr, bloom_filter);
ASSERT_EQ(bloom_filter->GetBitsetSize(),
BlockSplitBloomFilter::OptimalNumOfBytes(bloom_filter_options.ndv,
bloom_filter_options.fpp));
std::vector<uint64_t> insert_hashes = {100, 200};
for (uint64_t hash : insert_hashes) {
bloom_filter->InsertHash(hash);
}
builder->Finish();
auto sink = CreateOutputStream();
BloomFilterLocation location;
builder->WriteTo(sink.get(), &location);
EXPECT_EQ(1, location.bloom_filter_location.size());
EXPECT_EQ(2, location.bloom_filter_location[0].size());
EXPECT_TRUE(location.bloom_filter_location[0][0].has_value());
EXPECT_FALSE(location.bloom_filter_location[0][1].has_value());

int64_t bloom_filter_offset = location.bloom_filter_location[0][0]->offset;
int32_t bloom_filter_length = location.bloom_filter_location[0][0]->length;

ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
ReaderProperties reader_properties;
::arrow::io::BufferReader reader(
::arrow::SliceBuffer(buffer, bloom_filter_offset, bloom_filter_length));
auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, &reader);
for (uint64_t hash : insert_hashes) {
EXPECT_TRUE(filter.FindHash(hash));  // verify the deserialized filter
}
EXPECT_FALSE(filter.FindHash(300));
}

TEST(BloomFilterBuilderTest, InvalidOperations) {
SchemaDescriptor schema;
schema::NodePtr root =
schema::GroupNode::Make("schema", Repetition::REPEATED, {schema::ByteArray("c1")});
schema.Init(root);
auto properties = WriterProperties::Builder().build();
auto builder = BloomFilterBuilder::Make(&schema, *properties);
// AppendRowGroup() is not called and expect throw.
BloomFilterOptions default_options;
ASSERT_THROW(builder->GetOrCreateBloomFilter(0, default_options), ParquetException);

builder->AppendRowGroup();
// GetOrCreateBloomFilter() with wrong column ordinal expect throw.
ASSERT_THROW(builder->GetOrCreateBloomFilter(1, default_options), ParquetException);
builder->GetOrCreateBloomFilter(0, default_options);
auto sink = CreateOutputStream();
BloomFilterLocation location;
// WriteTo() before Finish() expect throw.
ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException);
builder->Finish();
builder->WriteTo(sink.get(), &location);
EXPECT_EQ(1, location.bloom_filter_location.size());
}

} // namespace parquet::test