GH-34785: [C++][Parquet] Parquet Bloom Filter Writer Implementation #37400

Status: Open. Wants to merge 62 commits into base: main.
Changes shown below are from 4 commits.

Commits (62):
f1c6dc0  Parquet: Implement skeleton for BloomFilter (mapleFU, Aug 26, 2023)
6ebd6da  tiny fixing (mapleFU, Aug 26, 2023)
70c9267  tiny update test (mapleFU, Aug 26, 2023)
48350d8  trying to fix ci (mapleFU, Aug 26, 2023)
d2a659e  fix lint (mapleFU, Aug 26, 2023)
41236d8  fix some style problem (mapleFU, Aug 26, 2023)
8afba81  add file roundtrip test (mapleFU, Aug 26, 2023)
96c6691  add file roundtrip test (mapleFU, Aug 26, 2023)
c131341  fix document and ci (mapleFU, Aug 26, 2023)
220b58e  Update: tiny style fix (mapleFU, Aug 26, 2023)
ad96c48  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Sep 2, 2023)
b756241  Bloom Filter Resolve comments: (mapleFU, Sep 2, 2023)
f43505b  make space writing a batched writing (mapleFU, Sep 2, 2023)
3497f4a  update bloom_filter builder interface (mapleFU, Sep 2, 2023)
fecd0f0  update BloomFilterBuilder arguments (mapleFU, Sep 2, 2023)
29cc1c1  fix compile (mapleFU, Sep 2, 2023)
ffbb491  try to satisfy win compiler (mapleFU, Sep 2, 2023)
4d63428  change all to vector (mapleFU, Sep 2, 2023)
f689716  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Sep 11, 2023)
8e9cb16  resolve comment (mapleFU, Sep 11, 2023)
7fd47be  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Oct 2, 2023)
7c4ff4e  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Oct 10, 2023)
feccee9  fix some comment (mapleFU, Oct 10, 2023)
90245e7  add cached version test (mapleFU, Oct 10, 2023)
d924e36  cleaning the code for column-props (mapleFU, Oct 10, 2023)
0340193  optimize get bf (mapleFU, Oct 10, 2023)
b78eed0  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Mar 16, 2024)
23828e1  comment minor fix (mapleFU, Mar 16, 2024)
6fd57dc  fix comment and add bloom-filter-length (mapleFU, Mar 16, 2024)
86a8760  Fix a bf bug (mapleFU, Mar 16, 2024)
f8e724c  trying to use std::map for RowGroup filter (mapleFU, Mar 17, 2024)
447badf  trying to fix msvc compile (mapleFU, Mar 17, 2024)
0c1065c  fix comment (mapleFU, Mar 17, 2024)
5225e08  add test case for 2 row-groups (mapleFU, Mar 17, 2024)
a779982  add test case for dictionary (mapleFU, Mar 17, 2024)
4195406  minor update style for file_writer.cc (mapleFU, Mar 17, 2024)
ed267bd  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Mar 26, 2024)
478889d  resolve comment (mapleFU, Mar 26, 2024)
2992072  fix comment for boolean col, and add test (mapleFU, Mar 26, 2024)
4852261  trying to add bloom boolean test (mapleFU, Mar 26, 2024)
add1afd  fix test (mapleFU, Mar 26, 2024)
f627e30  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Apr 8, 2024)
bb8d4a5  fix some comments (mapleFU, Apr 8, 2024)
ad0f1af  Merge branch 'parquet/support-write-bloom-filter' of github.com:maple… (mapleFU, Apr 8, 2024)
e1de5bc  fix lint (mapleFU, Apr 8, 2024)
430742a  switch to anonymous namespace (mapleFU, Apr 9, 2024)
00f176e  fix comment for column_writer.cc (mapleFU, Apr 26, 2024)
17f4951  fix comment in other parts (mapleFU, Apr 26, 2024)
de27ce4  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Apr 26, 2024)
259f15b  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Apr 26, 2024)
057b542  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Jun 10, 2024)
34a4c28  trying to fix the ci build (mapleFU, Jun 10, 2024)
70e3508  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Jul 3, 2024)
c587568  resolve comments (mapleFU, Jul 3, 2024)
2223423  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Nov 11, 2024)
22030db  change the bloom filter from vector to map (mapleFU, Nov 11, 2024)
e9c550a  fix lint (mapleFU, Nov 11, 2024)
23fb3fa  fix lint (mapleFU, Nov 14, 2024)
d892819  fix comment (mapleFU, Nov 15, 2024)
ef3291d  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Dec 20, 2024)
7aee7dd  Merge branch 'main' into parquet/support-write-bloom-filter (mapleFU, Jan 13, 2025)
c5b1fb1  Resolve comments (mapleFU, Jan 13, 2025)

3 changes: 2 additions & 1 deletion cpp/src/parquet/CMakeLists.txt
@@ -156,6 +156,7 @@ set(PARQUET_SRCS
arrow/writer.cc
bloom_filter.cc
bloom_filter_reader.cc
bloom_filter_builder.cc
column_reader.cc
column_scanner.cc
column_writer.cc
@@ -335,7 +336,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h"
add_parquet_test(internals-test
SOURCES
bloom_filter_test.cc
bloom_filter_reader_test.cc
bloom_filter_parquet_test.cc
properties_test.cc
statistics_test.cc
encoding_test.cc
6 changes: 6 additions & 0 deletions cpp/src/parquet/bloom_filter.h
@@ -167,6 +167,12 @@ class PARQUET_EXPORT BloomFilter {

virtual ~BloomFilter() = default;

// Variants taking a const pointer argument, to facilitate templated callers.
uint64_t Hash(const int32_t* value) const { return Hash(*value); }
uint64_t Hash(const int64_t* value) const { return Hash(*value); }
uint64_t Hash(const float* value) const { return Hash(*value); }
uint64_t Hash(const double* value) const { return Hash(*value); }

protected:
// Hash strategy available for Bloom filter.
enum class HashStrategy : uint32_t { XXHASH = 0 };
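
As an aside, here is a minimal sketch (a hypothetical helper, not part of this PR) of how the pointer-taking Hash() overloads above let templated code hash fixed-width values the same way it already hashes ByteArray through a const pointer:

#include "parquet/bloom_filter.h"
#include "parquet/types.h"

// Illustration only: with the pointer overloads, the call shape is
// Hash(const T*) for int32/int64/float/double as well as Int96/ByteArray,
// so one template can cover them all. FLBA still needs its type length and
// is not handled by this sketch.
template <typename DType>
void InsertValues(parquet::BloomFilter* filter,
                  const typename DType::c_type* values, int64_t num_values) {
  for (int64_t i = 0; i < num_values; ++i) {
    filter->InsertHash(filter->Hash(&values[i]));
  }
}
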
117 changes: 117 additions & 0 deletions cpp/src/parquet/bloom_filter_builder.cc
@@ -0,0 +1,117 @@
#include "parquet/bloom_filter_builder.h"

#include <utility>

#include "arrow/io/interfaces.h"

#include "metadata.h"
#include "parquet/bloom_filter.h"
#include "parquet/exception.h"
#include "parquet/properties.h"

namespace parquet {

class BloomFilterBuilderImpl : public BloomFilterBuilder {
public:
explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema,
WriterProperties properties)
Member: What's the point of making a copy here?

Member Author: Anyway, the underlying builder doesn't hold a reference here?

Member: The builder cannot outlive the FileWriter, so why not simply follow other places like this: https://github.com/search?q=repo%3Aapache%2Farrow+%22const+WriterProperties*%22&type=code

Member Author: done

: schema_(schema), properties_(std::move(properties)) {}
/// Append a new row group to host all incoming bloom filters.
void AppendRowGroup() override;
Contributor: This isn't actually appending a new row group, just marking that a row group is starting, so filters should be reset?

Member Author: Yes. Parquet uses row-group-level bloom filters, so this just sets up a new row group for the filters.

BloomFilter* GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) override;

/// Serialize all bloom filters with header and bitset in the order of row group and
/// column id. Column encryption is not implemented yet. The side effect is that it
Member (quoting "Column encryption is not implemented yet"): What about putting this at the beginning of class BloomFilterBuilder?

/// deletes all bloom filters after they have been flushed.
void WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) override;

void Finish() override { finished_ = true; }

private:
/// Make sure column ordinal is not out of bound and the builder is in good state.
void CheckState(int32_t column_ordinal) const {
if (finished_) {
throw ParquetException("PageIndexBuilder is already finished.");
}
if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) {
throw ParquetException("Invalid column ordinal: ", column_ordinal);
}
if (row_group_bloom_filters_.empty()) {
throw ParquetException("No row group appended to PageIndexBuilder.");
}
}

const SchemaDescriptor* schema_;
WriterProperties properties_;
bool finished_ = false;

// vector: row_group_ordinal
// map: column_ordinal -> bloom filter
std::vector<std::map<int32_t, std::unique_ptr<BloomFilter>>> row_group_bloom_filters_;
};

std::unique_ptr<BloomFilterBuilder> BloomFilterBuilder::Make(
const SchemaDescriptor* schema, const WriterProperties& properties) {
return std::unique_ptr<BloomFilterBuilder>(
new BloomFilterBuilderImpl(schema, properties));
}

void BloomFilterBuilderImpl::AppendRowGroup() { row_group_bloom_filters_.emplace_back(); }

BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) {
CheckState(column_ordinal);
std::unique_ptr<BloomFilter>& bloom_filter =
row_group_bloom_filters_.back()[column_ordinal];
if (bloom_filter == nullptr) {
auto block_split_bloom_filter =
std::make_unique<BlockSplitBloomFilter>(properties_.memory_pool());
block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes(
Contributor: There was a recent discussion on the Parquet mailing list about bloom filters and what good writers should do. My take-aways were:

  1. Knowing NDV up-front typically requires two passes, which I don't think we are doing. It might be better to take FPP and a byte size, and work out NDV if necessary.
  2. An extension of this idea (I think someone tried it in Java, maybe it was you?) is to have multiple byte sizes (e.g. at log_2 intervals with an FPP), write to all of them, and then choose the smallest one that is reasonably sparse.
  3. In either case it would likely be a good idea to evaluate the final bloom filters for sparsity before choosing to write them (this might be another config parameter).

Member Author (quoting "I think someone tried in Java, maybe it was you?"): Personally I think the best way is to buffer the hash values and make a decision later, when there are too many hash values or the buffer is too large. But I think we can first make a "static" config and enhance it later.

Member: I have reviewed that PR and it could be a followup change. The writer implementation has the freedom to try smart things.

FYI, parquet-java also discards the bloom filter if dictionary encoding is applied to all data pages, though I don't think we should do the same thing.

bloom_filter_options.ndv, bloom_filter_options.fpp));
bloom_filter = std::move(block_split_bloom_filter);
}
return bloom_filter.get();
}
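
For context on the ndv/fpp trade-off discussed in the thread above, here is a standalone sketch of the classic bloom filter sizing rule. It is only an approximation of what a writer might compute; the actual byte count returned by BlockSplitBloomFilter::OptimalNumOfBytes may differ (for example due to block granularity or bounds clamping):

#include <cmath>
#include <cstdint>

// Classic rule of thumb: bits ~= -ndv * ln(fpp) / (ln 2)^2.
// Illustration only; not the implementation behind OptimalNumOfBytes.
uint32_t ApproxBloomFilterBytes(uint32_t ndv, double fpp) {
  const double ln2 = std::log(2.0);
  const double bits = -static_cast<double>(ndv) * std::log(fpp) / (ln2 * ln2);
  return static_cast<uint32_t>(std::ceil(bits / 8.0));
}

For example, ndv = 100 at fpp = 0.01 works out to roughly 120 bytes before any rounding.
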

void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) {
if (!finished_) {
throw ParquetException("Cannot call WriteTo() to unfinished PageIndexBuilder.");
}
if (row_group_bloom_filters_.empty()) {
// Return quickly if there is no bloom filter
return;
}

for (size_t row_group_ordinal = 0; row_group_ordinal < row_group_bloom_filters_.size();
++row_group_ordinal) {
const auto& row_group_bloom_filters = row_group_bloom_filters_[row_group_ordinal];
// the whole row group has no bloom filter
if (row_group_bloom_filters.empty()) {
continue;
}
bool has_valid_bloom_filter = false;
int num_columns = schema_->num_columns();
std::vector<std::optional<IndexLocation>> locations(num_columns, std::nullopt);
Contributor: Is it maybe better to make this a map? I expect the number of columns with a bloom filter to be relatively small compared to the overall number of columns.

Member Author: 🤔 This reuses some structure from the PageIndex; however, IndexLocation is just 9 bytes, and even for a Parquet file with 10000 columns the cost here is low (about 200 KiB), so I think we can keep the vector here?


// Serialize bloom filters in ascending order of column id
for (int32_t column_id = 0; column_id < num_columns; ++column_id) {
auto iter = row_group_bloom_filters.find(column_id);
if (iter != row_group_bloom_filters.cend() && iter->second != nullptr) {
PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell());
iter->second->WriteTo(sink);
PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell());
has_valid_bloom_filter = true;
locations[column_id] = IndexLocation{offset, static_cast<int32_t>(pos - offset)};
}
}
if (has_valid_bloom_filter) {
location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations));
}
}
}

} // namespace parquet
69 changes: 69 additions & 0 deletions cpp/src/parquet/bloom_filter_builder.h
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This module defines the BloomFilterBuilder interface for collecting the
// bloom filters of a Parquet file while it is being written, and for
// serializing them to the output stream.

#pragma once

#include "arrow/io/interfaces.h"
#include "parquet/types.h"

namespace parquet {

class BloomFilter;
Member: Move these forward declarations to parquet/type_fwd.h?

class SchemaDescriptor;
struct BloomFilterOptions;
struct BloomFilterLocation;

namespace schema {
class ColumnPath;
}

/// \brief Interface for collecting bloom filter of a parquet file.
class PARQUET_EXPORT BloomFilterBuilder {
Member: Should this be BloomFilterWriter instead?

Member Author: IMO, BloomFilterBuilder is better, because the class BloomFilter is more like a BloomFilterWriter.

Member Author: The other name also LGTM; BloomFilterBuilder reads like a "container for the bloom filter writer of each column".

Member: We already have a lot of Builders, so BloomFilterBuilder looks fine to me.

Member: BTW, do we really need a separate file for the new builder? If yes, should we make it an internal header? My rationale is that this kind of builder is used only internally by the writer, and users do not have to deal with it at all. However, we have already exposed builders like FileMetaDataBuilder, RowGroupMetaDataBuilder, ColumnChunkMetaDataBuilder, and PageIndexBuilder (unfortunately the last was added by myself).

Member Author: You're right, this could be an internal class.

Member Author: I use an anonymous namespace here.

Member: Where is the anonymous namespace?

Member Author: Currently I changed it to internal, since an anonymous namespace for a structure used by Parquet is a bit hacky 🤔

public:
/// \brief API convenience to create a BloomFilterBuilder.
Contributor: It seems that this is a static factory? I don't think users can create one without calling this method.

Member Author: Yes, it's a static factory.

Member Author: Changing it to /// \brief API to create a BloomFilterBuilder.

static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor* schema,
const WriterProperties& properties);

/// Append a new row group to host all incoming bloom filters.
virtual void AppendRowGroup() = 0;

/// \brief Get the BloomFilter from column ordinal.
///
/// \param column_ordinal Column ordinal in schema, which is only for leaf columns.
/// \return BloomFilter for the column; its memory ownership belongs to
/// the BloomFilterBuilder.
virtual BloomFilter* GetOrCreateBloomFilter(
int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0;

/// \brief Write the bloom filter to sink.
///
/// \param[out] sink The output stream to write the bloom filters to.
/// \param[out] location The location of all bloom filters relative to the start of the sink.
virtual void WriteTo(::arrow::io::OutputStream* sink,
BloomFilterLocation* location) = 0;

/// \brief Complete the bloom filter builder; no more writes are allowed afterwards.
virtual void Finish() = 0;

virtual ~BloomFilterBuilder() = default;
};

} // namespace parquet
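
To make the intended call sequence concrete, here is a rough usage sketch of the interface above, mirroring the round-trip test below. The function name, the chosen options, and the exact includes (e.g. parquet/metadata.h for BloomFilterLocation) are assumptions for illustration, not taken from this PR:

#include "parquet/bloom_filter.h"
#include "parquet/bloom_filter_builder.h"
#include "parquet/metadata.h"
#include "parquet/properties.h"
#include "parquet/schema.h"

// Sketch of the expected lifecycle from a writer's point of view.
void BuildAndFlushBloomFilters(const parquet::SchemaDescriptor* schema,
                               const parquet::WriterProperties& properties,
                               ::arrow::io::OutputStream* sink,
                               parquet::BloomFilterLocation* location) {
  auto builder = parquet::BloomFilterBuilder::Make(schema, properties);
  builder->AppendRowGroup();             // once per row group
  parquet::BloomFilterOptions options;   // ndv/fpp would come from column properties
  auto* filter = builder->GetOrCreateBloomFilter(/*column_ordinal=*/0, options);
  filter->InsertHash(filter->Hash(int64_t{42}));  // hash values as they are written
  builder->Finish();                     // no further inserts allowed
  builder->WriteTo(sink, location);      // serialize before the file footer
}
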
141 changes: 141 additions & 0 deletions cpp/src/parquet/bloom_filter_parquet_test.cc
@@ -0,0 +1,141 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <arrow/testing/gtest_util.h>
#include <gtest/gtest.h>

#include "parquet/bloom_filter.h"
#include "parquet/bloom_filter_builder.h"
#include "parquet/bloom_filter_reader.h"
#include "parquet/file_reader.h"
#include "parquet/test_util.h"

namespace parquet::test {

TEST(BloomFilterReader, ReadBloomFilter) {
std::string dir_string(parquet::test::get_data_dir());
std::string path = dir_string + "/data_index_bloom_encoding_stats.parquet";
auto reader = ParquetFileReader::OpenFile(path, false);
auto file_metadata = reader->metadata();
EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
auto& bloom_filter_reader = reader->GetBloomFilterReader();
auto row_group_0 = bloom_filter_reader.RowGroup(0);
ASSERT_NE(nullptr, row_group_0);
EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
ASSERT_NE(nullptr, bloom_filter);
EXPECT_THROW(row_group_0->GetColumnBloomFilter(1), ParquetException);

// assert exists
{
std::string_view sv = "Hello";
ByteArray ba{sv};
EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba)));
}

// assert that a value not in the column is not found
{
std::string_view sv = "NOT_EXISTS";
ByteArray ba{sv};
EXPECT_FALSE(bloom_filter->FindHash(bloom_filter->Hash(&ba)));
}
}

TEST(BloomFilterReader, FileNotHaveBloomFilter) {
// Can still get a BloomFilterReader and a RowGroupBloomFilter
// reader, but cannot get a non-null BloomFilter.
std::string dir_string(parquet::test::get_data_dir());
std::string path = dir_string + "/alltypes_plain.parquet";
auto reader = ParquetFileReader::OpenFile(path, false);
auto file_metadata = reader->metadata();
EXPECT_FALSE(file_metadata->is_encryption_algorithm_set());
auto& bloom_filter_reader = reader->GetBloomFilterReader();
auto row_group_0 = bloom_filter_reader.RowGroup(0);
ASSERT_NE(nullptr, row_group_0);
EXPECT_THROW(bloom_filter_reader.RowGroup(1), ParquetException);
auto bloom_filter = row_group_0->GetColumnBloomFilter(0);
ASSERT_EQ(nullptr, bloom_filter);
}

// <c1:BYTE_ARRAY, c2:BYTE_ARRAY>, c1 has bloom filter.
TEST(BloomFilterBuilderTest, BasicRoundTrip) {
SchemaDescriptor schema;
schema::NodePtr root = schema::GroupNode::Make(
"schema", Repetition::REPEATED, {schema::ByteArray("c1"), schema::ByteArray("c2")});
schema.Init(root);
auto writer_properties = default_writer_properties();
auto builder = BloomFilterBuilder::Make(&schema, *writer_properties);
builder->AppendRowGroup();
BloomFilterOptions bloom_filter_options;
bloom_filter_options.ndv = 100;
auto bloom_filter = builder->GetOrCreateBloomFilter(0, bloom_filter_options);
ASSERT_NE(nullptr, bloom_filter);
ASSERT_EQ(bloom_filter->GetBitsetSize(),
BlockSplitBloomFilter::OptimalNumOfBytes(bloom_filter_options.ndv,
bloom_filter_options.fpp));
std::vector<uint64_t> insert_hashes = {100, 200};
for (uint64_t hash : insert_hashes) {
bloom_filter->InsertHash(hash);
}
builder->Finish();
auto sink = CreateOutputStream();
BloomFilterLocation location;
builder->WriteTo(sink.get(), &location);
EXPECT_EQ(1, location.bloom_filter_location.size());
EXPECT_EQ(2, location.bloom_filter_location[0].size());
EXPECT_TRUE(location.bloom_filter_location[0][0].has_value());
EXPECT_FALSE(location.bloom_filter_location[0][1].has_value());

int64_t bloom_filter_offset = location.bloom_filter_location[0][0]->offset;
int32_t bloom_filter_length = location.bloom_filter_location[0][0]->length;

ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
ReaderProperties reader_properties;
::arrow::io::BufferReader reader(
::arrow::SliceBuffer(buffer, bloom_filter_offset, bloom_filter_length));
auto filter = parquet::BlockSplitBloomFilter::Deserialize(reader_properties, &reader);
for (uint64_t hash : insert_hashes) {
EXPECT_TRUE(filter.FindHash(hash));  // verify the deserialized filter
}
EXPECT_FALSE(filter.FindHash(300));
}

TEST(BloomFilterBuilderTest, InvalidOperations) {
SchemaDescriptor schema;
schema::NodePtr root =
schema::GroupNode::Make("schema", Repetition::REPEATED, {schema::ByteArray("c1")});
schema.Init(root);
auto properties = WriterProperties::Builder().build();
auto builder = BloomFilterBuilder::Make(&schema, *properties);
// AppendRowGroup() is not called and expect throw.
BloomFilterOptions default_options;
ASSERT_THROW(builder->GetOrCreateBloomFilter(0, default_options), ParquetException);

builder->AppendRowGroup();
// GetOrCreateBloomFilter() with wrong column ordinal expect throw.
ASSERT_THROW(builder->GetOrCreateBloomFilter(1, default_options), ParquetException);
builder->GetOrCreateBloomFilter(0, default_options);
auto sink = CreateOutputStream();
BloomFilterLocation location;
// WriteTo() before Finish() expect throw.
ASSERT_THROW(builder->WriteTo(sink.get(), &location), ParquetException);
builder->Finish();
builder->WriteTo(sink.get(), &location);
EXPECT_EQ(1, location.bloom_filter_location.size());
}

} // namespace parquet::test