From c7b7690f37c482090f075b2c855fe45a619bff01 Mon Sep 17 00:00:00 2001 From: Tishj Date: Tue, 7 Jan 2025 13:32:47 +0100 Subject: [PATCH] dict_fsst start --- src/common/enums/compression_type.cpp | 8 +- src/function/compression_config.cpp | 2 + .../duckdb/common/enums/compression_type.hpp | 3 +- .../function/compression/compression.hpp | 5 + .../storage/compression/dict_fsst/analyze.hpp | 50 +++++ .../storage/compression/dict_fsst/common.hpp | 64 +++++++ .../compression/dict_fsst/compression.hpp | 64 +++++++ .../compression/dict_fsst/decompression.hpp | 54 ++++++ .../compression/dictionary/analyze.hpp | 2 + .../storage/compression/dictionary/common.hpp | 2 + .../compression/dictionary/compression.hpp | 2 + .../compression/dictionary/decompression.hpp | 2 + src/storage/compression/CMakeLists.txt | 2 + src/storage/compression/dict_fsst.cpp | 140 ++++++++++++++ .../compression/dict_fsst/CMakeLists.txt | 5 + src/storage/compression/dict_fsst/analyze.cpp | 56 ++++++ src/storage/compression/dict_fsst/common.cpp | 92 +++++++++ .../compression/dict_fsst/compression.cpp | 174 ++++++++++++++++++ .../compression/dict_fsst/decompression.cpp | 126 +++++++++++++ .../compression/dictionary/analyze.cpp | 2 + src/storage/compression/dictionary/common.cpp | 2 + .../compression/dictionary/compression.cpp | 2 + .../compression/dictionary/decompression.cpp | 2 + .../compression/dictionary_compression.cpp | 17 +- 24 files changed, 870 insertions(+), 8 deletions(-) create mode 100644 src/include/duckdb/storage/compression/dict_fsst/analyze.hpp create mode 100644 src/include/duckdb/storage/compression/dict_fsst/common.hpp create mode 100644 src/include/duckdb/storage/compression/dict_fsst/compression.hpp create mode 100644 src/include/duckdb/storage/compression/dict_fsst/decompression.hpp create mode 100644 src/storage/compression/dict_fsst.cpp create mode 100644 src/storage/compression/dict_fsst/CMakeLists.txt create mode 100644 src/storage/compression/dict_fsst/analyze.cpp create mode 100644 src/storage/compression/dict_fsst/common.cpp create mode 100644 src/storage/compression/dict_fsst/compression.cpp create mode 100644 src/storage/compression/dict_fsst/decompression.cpp diff --git a/src/common/enums/compression_type.cpp b/src/common/enums/compression_type.cpp index 3bc66fe08b36..dda1e8bec8c7 100644 --- a/src/common/enums/compression_type.cpp +++ b/src/common/enums/compression_type.cpp @@ -19,7 +19,9 @@ vector ListCompressionTypes(void) { bool CompressionTypeIsDeprecated(CompressionType compression_type) { const bool is_patas = compression_type == CompressionType::COMPRESSION_PATAS; const bool is_chimp = compression_type == CompressionType::COMPRESSION_CHIMP; - return (is_patas || is_chimp); + const bool is_dict = compression_type == CompressionType::COMPRESSION_DICTIONARY; + const bool is_fsst = compression_type == CompressionType::COMPRESSION_FSST; + return (is_patas || is_chimp || is_dict || is_fsst); } CompressionType CompressionTypeFromString(const string &str) { @@ -50,6 +52,8 @@ CompressionType CompressionTypeFromString(const string &str) { return CompressionType::COMPRESSION_ALPRD; } else if (compression == "roaring") { return CompressionType::COMPRESSION_ROARING; + } else if (compression == "dict_fsst") { + return CompressionType::COMPRESSION_DICT_FSST; } else { return CompressionType::COMPRESSION_AUTO; } @@ -85,6 +89,8 @@ string CompressionTypeToString(CompressionType type) { return "ALPRD"; case CompressionType::COMPRESSION_ROARING: return "Roaring"; + case CompressionType::COMPRESSION_DICT_FSST: + return "DICT_FSST"; case CompressionType::COMPRESSION_EMPTY: return "Empty Validity"; default: diff --git a/src/function/compression_config.cpp b/src/function/compression_config.cpp index deece1a85ec4..fcd1413336a2 100644 --- a/src/function/compression_config.cpp +++ b/src/function/compression_config.cpp @@ -30,6 +30,8 @@ static const DefaultCompressionMethod internal_compression_methods[] = { {CompressionType::COMPRESSION_ROARING, RoaringCompressionFun::GetFunction, RoaringCompressionFun::TypeIsSupported}, {CompressionType::COMPRESSION_EMPTY, EmptyValidityCompressionFun::GetFunction, EmptyValidityCompressionFun::TypeIsSupported}, + {CompressionType::COMPRESSION_DICT_FSST, DictFSSTCompressionFun::GetFunction, + DictFSSTCompressionFun::TypeIsSupported}, {CompressionType::COMPRESSION_AUTO, nullptr, nullptr}}; static optional_ptr FindCompressionFunction(CompressionFunctionSet &set, CompressionType type, diff --git a/src/include/duckdb/common/enums/compression_type.hpp b/src/include/duckdb/common/enums/compression_type.hpp index a8753f12ba61..a9f50bc7b76d 100644 --- a/src/include/duckdb/common/enums/compression_type.hpp +++ b/src/include/duckdb/common/enums/compression_type.hpp @@ -29,7 +29,8 @@ enum class CompressionType : uint8_t { COMPRESSION_ZSTD = 12, COMPRESSION_ROARING = 13, COMPRESSION_EMPTY = 14, // internal only - COMPRESSION_COUNT // This has to stay the last entry of the type! + COMPRESSION_DICT_FSST = 15, + COMPRESSION_COUNT // This has to stay the last entry of the type! }; bool CompressionTypeIsDeprecated(CompressionType compression_type); diff --git a/src/include/duckdb/function/compression/compression.hpp b/src/include/duckdb/function/compression/compression.hpp index adf5364b7d24..337b0d19c44e 100644 --- a/src/include/duckdb/function/compression/compression.hpp +++ b/src/include/duckdb/function/compression/compression.hpp @@ -38,6 +38,11 @@ struct DictionaryCompressionFun { static bool TypeIsSupported(const PhysicalType physical_type); }; +struct DictFSSTCompressionFun { + static CompressionFunction GetFunction(PhysicalType type); + static bool TypeIsSupported(const PhysicalType physical_type); +}; + struct ChimpCompressionFun { static CompressionFunction GetFunction(PhysicalType type); static bool TypeIsSupported(const PhysicalType physical_type); diff --git a/src/include/duckdb/storage/compression/dict_fsst/analyze.hpp b/src/include/duckdb/storage/compression/dict_fsst/analyze.hpp new file mode 100644 index 000000000000..7a565efa3017 --- /dev/null +++ b/src/include/duckdb/storage/compression/dict_fsst/analyze.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "duckdb/storage/compression/dictionary/common.hpp" +#include "duckdb/common/string_map_set.hpp" +#include "duckdb/storage/table/column_data.hpp" + +namespace duckdb { + +namespace dict_fsst { + +//===--------------------------------------------------------------------===// +// Analyze +//===--------------------------------------------------------------------===// +struct DictFSSTAnalyzeState : public DictFSSTCompressionState { +public: + explicit DictFSSTAnalyzeState(const CompressionInfo &info); + +public: + bool LookupString(string_t str) override; + void AddNewString(string_t str) override; + void AddLastLookup() override; + void AddNull() override; + bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override; + void Flush(bool final = false) override; + void Verify() override; + +public: + idx_t segment_count; + idx_t current_tuple_count; + idx_t current_unique_count; + idx_t current_dict_size; + StringHeap heap; + string_set_t current_set; + bitpacking_width_t current_width; + bitpacking_width_t next_width; +}; + +struct DictFSSTCompressionAnalyzeState : public AnalyzeState { +public: + explicit DictFSSTCompressionAnalyzeState(const CompressionInfo &info) + : AnalyzeState(info), analyze_state(make_uniq(info)) { + } + +public: + unique_ptr analyze_state; +}; + +} // namespace dict_fsst + +} // namespace duckdb diff --git a/src/include/duckdb/storage/compression/dict_fsst/common.hpp b/src/include/duckdb/storage/compression/dict_fsst/common.hpp new file mode 100644 index 000000000000..fa4499c6707b --- /dev/null +++ b/src/include/duckdb/storage/compression/dict_fsst/common.hpp @@ -0,0 +1,64 @@ +#pragma once + +#include "duckdb/common/typedefs.hpp" +#include "duckdb/function/compression_function.hpp" +#include "duckdb/common/bitpacking.hpp" +#include "duckdb/storage/string_uncompressed.hpp" + +namespace duckdb { + +namespace dict_fsst { + +typedef struct { + uint32_t dict_size; + uint32_t dict_end; + uint32_t index_buffer_offset; + uint32_t index_buffer_count; + uint32_t bitpacking_width; +} dict_fsst_compression_header_t; + +struct DictFSSTCompression { +public: + static constexpr float MINIMUM_COMPRESSION_RATIO = 1.2F; + //! Dictionary header size at the beginning of the string segment (offset + length) + static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(dict_fsst_compression_header_t); + +public: + static bool HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size, + bitpacking_width_t packing_width, const idx_t block_size); + static idx_t RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size, + bitpacking_width_t packing_width); + + static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle); + static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer container); +}; + +//! Abstract class managing the compression state for size analysis or compression. +class DictFSSTCompressionState : public CompressionState { +public: + explicit DictFSSTCompressionState(const CompressionInfo &info); + ~DictFSSTCompressionState() override; + +public: + bool UpdateState(Vector &scan_vector, idx_t count); + +protected: + // Should verify the State + virtual void Verify() = 0; + // Performs a lookup of str, storing the result internally + virtual bool LookupString(string_t str) = 0; + // Add the most recently looked up str to compression state + virtual void AddLastLookup() = 0; + // Add string to the state that is known to not be seen yet + virtual void AddNewString(string_t str) = 0; + // Add a null value to the compression state + virtual void AddNull() = 0; + // Needs to be called before adding a value. Will return false if a flush is required first. + virtual bool CalculateSpaceRequirements(bool new_string, idx_t string_size) = 0; + // Flush the segment to disk if compressing or reset the counters if analyzing + virtual void Flush(bool final = false) = 0; +}; + +} // namespace dict_fsst + +} // namespace duckdb diff --git a/src/include/duckdb/storage/compression/dict_fsst/compression.hpp b/src/include/duckdb/storage/compression/dict_fsst/compression.hpp new file mode 100644 index 000000000000..725c816222c2 --- /dev/null +++ b/src/include/duckdb/storage/compression/dict_fsst/compression.hpp @@ -0,0 +1,64 @@ +#pragma once + +#include "duckdb/common/typedefs.hpp" +#include "duckdb/storage/compression/dictionary/common.hpp" +#include "duckdb/function/compression_function.hpp" +#include "duckdb/common/string_map_set.hpp" +#include "duckdb/storage/table/column_data_checkpointer.hpp" + +namespace duckdb { +namespace dict_fsst { + +// Dictionary compression uses a combination of bitpacking and a dictionary to compress string segments. The data is +// stored across three buffers: the index buffer, the selection buffer and the dictionary. Firstly the Index buffer +// contains the offsets into the dictionary which are also used to determine the string lengths. Each value in the +// dictionary gets a single unique index in the index buffer. Secondly, the selection buffer maps the tuples to an index +// in the index buffer. The selection buffer is compressed with bitpacking. Finally, the dictionary contains simply all +// the unique strings without lengths or null termination as we can deduce the lengths from the index buffer. The +// addition of the selection buffer is done for two reasons: firstly, to allow the scan to emit dictionary vectors by +// scanning the whole dictionary at once and then scanning the selection buffer for each emitted vector. Secondly, it +// allows for efficient bitpacking compression as the selection values should remain relatively small. + +//===--------------------------------------------------------------------===// +// Compress +//===--------------------------------------------------------------------===// +struct DictFSSTCompressionCompressState : public DictFSSTCompressionState { +public: + DictFSSTCompressionCompressState(ColumnDataCheckpointData &checkpoint_data_p, const CompressionInfo &info); + +public: + void CreateEmptySegment(idx_t row_start); + void Verify() override; + bool LookupString(string_t str) override; + void AddNewString(string_t str) override; + void AddNull() override; + void AddLastLookup() override; + bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override; + void Flush(bool final = false) override; + idx_t Finalize(); + +public: + ColumnDataCheckpointData &checkpoint_data; + CompressionFunction &function; + + // State regarding current segment + unique_ptr current_segment; + BufferHandle current_handle; + StringDictionaryContainer current_dictionary; + data_ptr_t current_end_ptr; + + // Buffers and map for current segment + StringHeap heap; + string_map_t current_string_map; + vector index_buffer; + vector selection_buffer; + + bitpacking_width_t current_width = 0; + bitpacking_width_t next_width = 0; + + // Result of latest LookupString call + uint32_t latest_lookup_result; +}; + +} // namespace dict_fsst +} // namespace duckdb diff --git a/src/include/duckdb/storage/compression/dict_fsst/decompression.hpp b/src/include/duckdb/storage/compression/dict_fsst/decompression.hpp new file mode 100644 index 000000000000..96c67e9b89cf --- /dev/null +++ b/src/include/duckdb/storage/compression/dict_fsst/decompression.hpp @@ -0,0 +1,54 @@ +#pragma once + +#include "duckdb/storage/compression/dict_fsst/common.hpp" + +namespace duckdb { + +namespace dict_fsst { + +//===--------------------------------------------------------------------===// +// Scan +//===--------------------------------------------------------------------===// +// FIXME: why is this StringScanState when we also define: `BufferHandle handle` ??? +struct CompressedStringScanState : public StringScanState { +public: + explicit CompressedStringScanState(BufferHandle &&handle_p) + : StringScanState(), owned_handle(std::move(handle_p)), handle(owned_handle) { + } + explicit CompressedStringScanState(BufferHandle &handle_p) : StringScanState(), owned_handle(), handle(handle_p) { + } + +public: + void Initialize(ColumnSegment &segment, bool initialize_dictionary = true); + void ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count); + void ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset, idx_t start, + idx_t scan_count); + +private: + string_t FetchStringFromDict(int32_t dict_offset, uint16_t string_len); + uint16_t GetStringLength(sel_t index); + +public: + BufferHandle owned_handle; + optional_ptr handle; + + bitpacking_width_t current_width; + buffer_ptr sel_vec; + idx_t sel_vec_size = 0; + + //! Start of the block (pointing to the dictionary_header) + data_ptr_t baseptr; + //! Start of the data (pointing to the start of the selection buffer) + data_ptr_t base_data; + uint32_t *index_buffer_ptr; + uint32_t index_buffer_count; + + buffer_ptr dictionary; + idx_t dictionary_size; + StringDictionaryContainer dict; + idx_t block_size; +}; + +} // namespace dict_fsst + +} // namespace duckdb diff --git a/src/include/duckdb/storage/compression/dictionary/analyze.hpp b/src/include/duckdb/storage/compression/dictionary/analyze.hpp index 99eb72156f9b..dc695b2f1d2a 100644 --- a/src/include/duckdb/storage/compression/dictionary/analyze.hpp +++ b/src/include/duckdb/storage/compression/dictionary/analyze.hpp @@ -5,6 +5,7 @@ #include "duckdb/storage/table/column_data.hpp" namespace duckdb { +namespace dictionary { //===--------------------------------------------------------------------===// // Analyze @@ -43,4 +44,5 @@ struct DictionaryCompressionAnalyzeState : public AnalyzeState { unique_ptr analyze_state; }; +} // namespace dictionary } // namespace duckdb diff --git a/src/include/duckdb/storage/compression/dictionary/common.hpp b/src/include/duckdb/storage/compression/dictionary/common.hpp index 79bd094bfa07..fbb9eebc24a4 100644 --- a/src/include/duckdb/storage/compression/dictionary/common.hpp +++ b/src/include/duckdb/storage/compression/dictionary/common.hpp @@ -6,6 +6,7 @@ #include "duckdb/storage/string_uncompressed.hpp" namespace duckdb { +namespace dictionary { typedef struct { uint32_t dict_size; @@ -57,4 +58,5 @@ class DictionaryCompressionState : public CompressionState { virtual void Flush(bool final = false) = 0; }; +} // namespace dictionary } // namespace duckdb diff --git a/src/include/duckdb/storage/compression/dictionary/compression.hpp b/src/include/duckdb/storage/compression/dictionary/compression.hpp index b0f29dc5901f..c6712a46c677 100644 --- a/src/include/duckdb/storage/compression/dictionary/compression.hpp +++ b/src/include/duckdb/storage/compression/dictionary/compression.hpp @@ -7,6 +7,7 @@ #include "duckdb/storage/table/column_data_checkpointer.hpp" namespace duckdb { +namespace dictionary { // Dictionary compression uses a combination of bitpacking and a dictionary to compress string segments. The data is // stored across three buffers: the index buffer, the selection buffer and the dictionary. Firstly the Index buffer @@ -59,4 +60,5 @@ struct DictionaryCompressionCompressState : public DictionaryCompressionState { uint32_t latest_lookup_result; }; +} // namespace dictionary } // namespace duckdb diff --git a/src/include/duckdb/storage/compression/dictionary/decompression.hpp b/src/include/duckdb/storage/compression/dictionary/decompression.hpp index 1656ec718fd8..84d3861b56c4 100644 --- a/src/include/duckdb/storage/compression/dictionary/decompression.hpp +++ b/src/include/duckdb/storage/compression/dictionary/decompression.hpp @@ -3,6 +3,7 @@ #include "duckdb/storage/compression/dictionary/common.hpp" namespace duckdb { +namespace dictionary { //===--------------------------------------------------------------------===// // Scan @@ -47,4 +48,5 @@ struct CompressedStringScanState : public StringScanState { idx_t block_size; }; +} // namespace dictionary } // namespace duckdb diff --git a/src/storage/compression/CMakeLists.txt b/src/storage/compression/CMakeLists.txt index 056811974ae1..ca83b2a3fa34 100644 --- a/src/storage/compression/CMakeLists.txt +++ b/src/storage/compression/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(chimp) add_subdirectory(alp) add_subdirectory(roaring) add_subdirectory(dictionary) +add_subdirectory(dict_fsst) add_library_unity( duckdb_storage_compression @@ -19,6 +20,7 @@ add_library_unity( zstd.cpp alprd.cpp fsst.cpp + dict_fsst.cpp empty_validity.cpp) set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ diff --git a/src/storage/compression/dict_fsst.cpp b/src/storage/compression/dict_fsst.cpp new file mode 100644 index 000000000000..068f9332e353 --- /dev/null +++ b/src/storage/compression/dict_fsst.cpp @@ -0,0 +1,140 @@ +#include "duckdb/storage/compression/dict_fsst/common.hpp" +#include "duckdb/storage/compression/dict_fsst/analyze.hpp" +#include "duckdb/storage/compression/dict_fsst/compression.hpp" +#include "duckdb/storage/compression/dict_fsst/decompression.hpp" +#include "duckdb/function/compression/compression.hpp" +#include "duckdb/function/compression_function.hpp" + +namespace duckdb { +namespace dict_fsst { + +struct DictFSSTCompressionStorage { + static unique_ptr StringInitAnalyze(ColumnData &col_data, PhysicalType type); + static bool StringAnalyze(AnalyzeState &state_p, Vector &input, idx_t count); + static idx_t StringFinalAnalyze(AnalyzeState &state_p); + + static unique_ptr InitCompression(ColumnDataCheckpointData &checkpoint_data, + unique_ptr state); + static void Compress(CompressionState &state_p, Vector &scan_vector, idx_t count); + static void FinalizeCompress(CompressionState &state_p); + + static unique_ptr StringInitScan(ColumnSegment &segment); + template + static void StringScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result, + idx_t result_offset); + static void StringScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result); + static void StringFetchRow(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result, + idx_t result_idx); +}; + +//===--------------------------------------------------------------------===// +// Analyze +//===--------------------------------------------------------------------===// +unique_ptr DictFSSTCompressionStorage::StringInitAnalyze(ColumnData &col_data, PhysicalType type) { + CompressionInfo info(col_data.GetBlockManager().GetBlockSize()); + return make_uniq(info); +} + +bool DictFSSTCompressionStorage::StringAnalyze(AnalyzeState &state_p, Vector &input, idx_t count) { + auto &state = state_p.Cast(); + return state.analyze_state->UpdateState(input, count); +} + +idx_t DictFSSTCompressionStorage::StringFinalAnalyze(AnalyzeState &state_p) { + auto &analyze_state = state_p.Cast(); + auto &state = *analyze_state.analyze_state; + + auto width = BitpackingPrimitives::MinimumBitWidth(state.current_unique_count + 1); + auto req_space = DictFSSTCompression::RequiredSpace(state.current_tuple_count, state.current_unique_count, + state.current_dict_size, width); + + const auto total_space = state.segment_count * state.info.GetBlockSize() + req_space; + return LossyNumericCast(DictFSSTCompression::MINIMUM_COMPRESSION_RATIO * float(total_space)); +} + +//===--------------------------------------------------------------------===// +// Compress +//===--------------------------------------------------------------------===// +unique_ptr DictFSSTCompressionStorage::InitCompression(ColumnDataCheckpointData &checkpoint_data, + unique_ptr state) { + return make_uniq(checkpoint_data, state->info); +} + +void DictFSSTCompressionStorage::Compress(CompressionState &state_p, Vector &scan_vector, idx_t count) { + auto &state = state_p.Cast(); + state.UpdateState(scan_vector, count); +} + +void DictFSSTCompressionStorage::FinalizeCompress(CompressionState &state_p) { + auto &state = state_p.Cast(); + state.Flush(true); +} + +//===--------------------------------------------------------------------===// +// Scan +//===--------------------------------------------------------------------===// +unique_ptr DictFSSTCompressionStorage::StringInitScan(ColumnSegment &segment) { + auto &buffer_manager = BufferManager::GetBufferManager(segment.db); + auto state = make_uniq(buffer_manager.Pin(segment.block)); + state->Initialize(segment, true); + return std::move(state); +} + +//===--------------------------------------------------------------------===// +// Scan base data +//===--------------------------------------------------------------------===// +template +void DictFSSTCompressionStorage::StringScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, + Vector &result, idx_t result_offset) { + // clear any previously locked buffers and get the primary buffer handle + auto &scan_state = state.scan_state->Cast(); + + auto start = segment.GetRelativeIndex(state.row_index); + if (!ALLOW_DICT_VECTORS || scan_count != STANDARD_VECTOR_SIZE || + start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE != 0) { + scan_state.ScanToFlatVector(result, result_offset, start, scan_count); + } else { + scan_state.ScanToDictionaryVector(segment, result, result_offset, start, scan_count); + } +} + +void DictFSSTCompressionStorage::StringScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, + Vector &result) { + StringScanPartial(segment, state, scan_count, result, 0); +} + +//===--------------------------------------------------------------------===// +// Fetch +//===--------------------------------------------------------------------===// +void DictFSSTCompressionStorage::StringFetchRow(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, + Vector &result, idx_t result_idx) { + // fetch a single row from the string segment + CompressedStringScanState scan_state(state.GetOrInsertHandle(segment)); + scan_state.Initialize(segment, false); + scan_state.ScanToFlatVector(result, result_idx, NumericCast(row_id), 1); +} + +} // namespace dict_fsst + +//===--------------------------------------------------------------------===// +// Get Function +//===--------------------------------------------------------------------===// +CompressionFunction DictFSSTCompressionFun::GetFunction(PhysicalType data_type) { + auto res = CompressionFunction( + CompressionType::COMPRESSION_DICT_FSST, data_type, dict_fsst::DictFSSTCompressionStorage::StringInitAnalyze, + dict_fsst::DictFSSTCompressionStorage::StringAnalyze, dict_fsst::DictFSSTCompressionStorage::StringFinalAnalyze, + dict_fsst::DictFSSTCompressionStorage::InitCompression, dict_fsst::DictFSSTCompressionStorage::Compress, + dict_fsst::DictFSSTCompressionStorage::FinalizeCompress, dict_fsst::DictFSSTCompressionStorage::StringInitScan, + dict_fsst::DictFSSTCompressionStorage::StringScan, + dict_fsst::DictFSSTCompressionStorage::StringScanPartial, + dict_fsst::DictFSSTCompressionStorage::StringFetchRow, UncompressedFunctions::EmptySkip, + UncompressedStringStorage::StringInitSegment); + res.validity = CompressionValidity::NO_VALIDITY_REQUIRED; + return res; +} + +bool DictFSSTCompressionFun::TypeIsSupported(const PhysicalType physical_type) { + return physical_type == PhysicalType::VARCHAR; +} + +} // namespace duckdb diff --git a/src/storage/compression/dict_fsst/CMakeLists.txt b/src/storage/compression/dict_fsst/CMakeLists.txt new file mode 100644 index 000000000000..fa92e8d2951e --- /dev/null +++ b/src/storage/compression/dict_fsst/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library_unity(duckdb_storage_compression_dict_fsst OBJECT common.cpp + analyze.cpp compression.cpp decompression.cpp) +set(ALL_OBJECT_FILES + ${ALL_OBJECT_FILES} $ + PARENT_SCOPE) diff --git a/src/storage/compression/dict_fsst/analyze.cpp b/src/storage/compression/dict_fsst/analyze.cpp new file mode 100644 index 000000000000..e55a9a4e6518 --- /dev/null +++ b/src/storage/compression/dict_fsst/analyze.cpp @@ -0,0 +1,56 @@ +#include "duckdb/storage/compression/dict_fsst/analyze.hpp" + +namespace duckdb { +namespace dict_fsst { + +DictFSSTAnalyzeState::DictFSSTAnalyzeState(const CompressionInfo &info) + : DictFSSTCompressionState(info), segment_count(0), current_tuple_count(0), current_unique_count(0), + current_dict_size(0), current_width(0), next_width(0) { +} + +bool DictFSSTAnalyzeState::LookupString(string_t str) { + return current_set.count(str); +} + +void DictFSSTAnalyzeState::AddNewString(string_t str) { + current_tuple_count++; + current_unique_count++; + current_dict_size += str.GetSize(); + if (str.IsInlined()) { + current_set.insert(str); + } else { + current_set.insert(heap.AddBlob(str)); + } + current_width = next_width; +} + +void DictFSSTAnalyzeState::AddLastLookup() { + current_tuple_count++; +} + +void DictFSSTAnalyzeState::AddNull() { + current_tuple_count++; +} + +bool DictFSSTAnalyzeState::CalculateSpaceRequirements(bool new_string, idx_t string_size) { + if (!new_string) { + return DictFSSTCompression::HasEnoughSpace(current_tuple_count + 1, current_unique_count, current_dict_size, + current_width, info.GetBlockSize()); + } + next_width = BitpackingPrimitives::MinimumBitWidth(current_unique_count + 2); // 1 for null, one for new string + return DictFSSTCompression::HasEnoughSpace(current_tuple_count + 1, current_unique_count + 1, + current_dict_size + string_size, next_width, info.GetBlockSize()); +} + +void DictFSSTAnalyzeState::Flush(bool final) { + segment_count++; + current_tuple_count = 0; + current_unique_count = 0; + current_dict_size = 0; + current_set.clear(); +} +void DictFSSTAnalyzeState::Verify() { +} + +} // namespace dict_fsst +} // namespace duckdb diff --git a/src/storage/compression/dict_fsst/common.cpp b/src/storage/compression/dict_fsst/common.cpp new file mode 100644 index 000000000000..ee5d9987283e --- /dev/null +++ b/src/storage/compression/dict_fsst/common.cpp @@ -0,0 +1,92 @@ +#include "duckdb/storage/compression/dict_fsst/common.hpp" + +namespace duckdb { +namespace dict_fsst { + +//===--------------------------------------------------------------------===// +// Helper Functions +//===--------------------------------------------------------------------===// +bool DictFSSTCompression::HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size, + bitpacking_width_t packing_width, const idx_t block_size) { + return RequiredSpace(current_count, index_count, dict_size, packing_width) <= block_size; +} + +idx_t DictFSSTCompression::RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size, + bitpacking_width_t packing_width) { + idx_t base_space = DICTIONARY_HEADER_SIZE + dict_size; + idx_t string_number_space = BitpackingPrimitives::GetRequiredSize(current_count, packing_width); + idx_t index_space = index_count * sizeof(uint32_t); + + idx_t used_space = base_space + index_space + string_number_space; + + return used_space; +} + +StringDictionaryContainer DictFSSTCompression::GetDictionary(ColumnSegment &segment, BufferHandle &handle) { + auto header_ptr = reinterpret_cast(handle.Ptr() + segment.GetBlockOffset()); + StringDictionaryContainer container; + container.size = Load(data_ptr_cast(&header_ptr->dict_size)); + container.end = Load(data_ptr_cast(&header_ptr->dict_end)); + return container; +} + +void DictFSSTCompression::SetDictionary(ColumnSegment &segment, BufferHandle &handle, + StringDictionaryContainer container) { + auto header_ptr = reinterpret_cast(handle.Ptr() + segment.GetBlockOffset()); + Store(container.size, data_ptr_cast(&header_ptr->dict_size)); + Store(container.end, data_ptr_cast(&header_ptr->dict_end)); +} + +DictFSSTCompressionState::DictFSSTCompressionState(const CompressionInfo &info) : CompressionState(info) { +} +DictFSSTCompressionState::~DictFSSTCompressionState() { +} + +bool DictFSSTCompressionState::UpdateState(Vector &scan_vector, idx_t count) { + UnifiedVectorFormat vdata; + scan_vector.ToUnifiedFormat(count, vdata); + auto data = UnifiedVectorFormat::GetData(vdata); + Verify(); + + for (idx_t i = 0; i < count; i++) { + auto idx = vdata.sel->get_index(i); + idx_t string_size = 0; + bool new_string = false; + auto row_is_valid = vdata.validity.RowIsValid(idx); + + if (row_is_valid) { + string_size = data[idx].GetSize(); + if (string_size >= StringUncompressed::GetStringBlockLimit(info.GetBlockSize())) { + // Big strings not implemented for dictionary compression + return false; + } + new_string = !LookupString(data[idx]); + } + + bool fits = CalculateSpaceRequirements(new_string, string_size); + if (!fits) { + Flush(); + new_string = true; + + fits = CalculateSpaceRequirements(new_string, string_size); + if (!fits) { + throw InternalException("Dictionary compression could not write to new segment"); + } + } + + if (!row_is_valid) { + AddNull(); + } else if (new_string) { + AddNewString(data[idx]); + } else { + AddLastLookup(); + } + + Verify(); + } + + return true; +} + +} // namespace dict_fsst +} // namespace duckdb diff --git a/src/storage/compression/dict_fsst/compression.cpp b/src/storage/compression/dict_fsst/compression.cpp new file mode 100644 index 000000000000..5241e36f1ebb --- /dev/null +++ b/src/storage/compression/dict_fsst/compression.cpp @@ -0,0 +1,174 @@ +#include "duckdb/storage/compression/dict_fsst/compression.hpp" +#include "duckdb/storage/segment/uncompressed.hpp" + +namespace duckdb { +namespace dict_fsst { + +DictFSSTCompressionCompressState::DictFSSTCompressionCompressState(ColumnDataCheckpointData &checkpoint_data_p, + const CompressionInfo &info) + : DictFSSTCompressionState(info), checkpoint_data(checkpoint_data_p), + function(checkpoint_data.GetCompressionFunction(CompressionType::COMPRESSION_DICT_FSST)), + heap(BufferAllocator::Get(checkpoint_data.GetDatabase())) { + CreateEmptySegment(checkpoint_data.GetRowGroup().start); +} + +void DictFSSTCompressionCompressState::CreateEmptySegment(idx_t row_start) { + auto &db = checkpoint_data.GetDatabase(); + auto &type = checkpoint_data.GetType(); + + auto compressed_segment = + ColumnSegment::CreateTransientSegment(db, function, type, row_start, info.GetBlockSize(), info.GetBlockSize()); + current_segment = std::move(compressed_segment); + + // Reset the buffers and the string map. + current_string_map.clear(); + index_buffer.clear(); + + // Reserve index 0 for null strings. + index_buffer.push_back(0); + selection_buffer.clear(); + + current_width = 0; + next_width = 0; + + // Reset the pointers into the current segment. + auto &buffer_manager = BufferManager::GetBufferManager(checkpoint_data.GetDatabase()); + current_handle = buffer_manager.Pin(current_segment->block); + current_dictionary = DictFSSTCompression::GetDictionary(*current_segment, current_handle); + current_end_ptr = current_handle.Ptr() + current_dictionary.end; +} + +void DictFSSTCompressionCompressState::Verify() { + current_dictionary.Verify(info.GetBlockSize()); + D_ASSERT(current_segment->count == selection_buffer.size()); + D_ASSERT(DictFSSTCompression::HasEnoughSpace(current_segment->count.load(), index_buffer.size(), + current_dictionary.size, current_width, info.GetBlockSize())); + D_ASSERT(current_dictionary.end == info.GetBlockSize()); + D_ASSERT(index_buffer.size() == current_string_map.size() + 1); // +1 is for null value +} + +bool DictFSSTCompressionCompressState::LookupString(string_t str) { + auto search = current_string_map.find(str); + auto has_result = search != current_string_map.end(); + + if (has_result) { + latest_lookup_result = search->second; + } + return has_result; +} + +void DictFSSTCompressionCompressState::AddNewString(string_t str) { + UncompressedStringStorage::UpdateStringStats(current_segment->stats, str); + + // Copy string to dict + current_dictionary.size += str.GetSize(); + auto dict_pos = current_end_ptr - current_dictionary.size; + memcpy(dict_pos, str.GetData(), str.GetSize()); + current_dictionary.Verify(info.GetBlockSize()); + D_ASSERT(current_dictionary.end == info.GetBlockSize()); + + // Update buffers and map + index_buffer.push_back(current_dictionary.size); + selection_buffer.push_back(UnsafeNumericCast(index_buffer.size() - 1)); + if (str.IsInlined()) { + current_string_map.insert({str, index_buffer.size() - 1}); + } else { + current_string_map.insert({heap.AddBlob(str), index_buffer.size() - 1}); + } + DictFSSTCompression::SetDictionary(*current_segment, current_handle, current_dictionary); + + current_width = next_width; + current_segment->count++; +} + +void DictFSSTCompressionCompressState::AddNull() { + selection_buffer.push_back(0); + current_segment->count++; +} + +void DictFSSTCompressionCompressState::AddLastLookup() { + selection_buffer.push_back(latest_lookup_result); + current_segment->count++; +} + +bool DictFSSTCompressionCompressState::CalculateSpaceRequirements(bool new_string, idx_t string_size) { + if (!new_string) { + return DictFSSTCompression::HasEnoughSpace(current_segment->count.load() + 1, index_buffer.size(), + current_dictionary.size, current_width, info.GetBlockSize()); + } + next_width = BitpackingPrimitives::MinimumBitWidth(index_buffer.size() - 1 + new_string); + return DictFSSTCompression::HasEnoughSpace(current_segment->count.load() + 1, index_buffer.size() + 1, + current_dictionary.size + string_size, next_width, info.GetBlockSize()); +} + +void DictFSSTCompressionCompressState::Flush(bool final) { + auto next_start = current_segment->start + current_segment->count; + + auto segment_size = Finalize(); + auto &state = checkpoint_data.GetCheckpointState(); + state.FlushSegment(std::move(current_segment), std::move(current_handle), segment_size); + + if (!final) { + CreateEmptySegment(next_start); + } +} + +idx_t DictFSSTCompressionCompressState::Finalize() { + auto &buffer_manager = BufferManager::GetBufferManager(checkpoint_data.GetDatabase()); + auto handle = buffer_manager.Pin(current_segment->block); + D_ASSERT(current_dictionary.end == info.GetBlockSize()); + + // calculate sizes + auto compressed_selection_buffer_size = + BitpackingPrimitives::GetRequiredSize(current_segment->count, current_width); + auto index_buffer_size = index_buffer.size() * sizeof(uint32_t); + auto total_size = DictFSSTCompression::DICTIONARY_HEADER_SIZE + compressed_selection_buffer_size + + index_buffer_size + current_dictionary.size; + + // calculate ptr and offsets + auto base_ptr = handle.Ptr(); + auto header_ptr = reinterpret_cast(base_ptr); + auto compressed_selection_buffer_offset = DictFSSTCompression::DICTIONARY_HEADER_SIZE; + auto index_buffer_offset = compressed_selection_buffer_offset + compressed_selection_buffer_size; + + // Write compressed selection buffer + BitpackingPrimitives::PackBuffer(base_ptr + compressed_selection_buffer_offset, + (sel_t *)(selection_buffer.data()), current_segment->count, + current_width); + + // Write the index buffer + memcpy(base_ptr + index_buffer_offset, index_buffer.data(), index_buffer_size); + + // Store sizes and offsets in segment header + Store(NumericCast(index_buffer_offset), data_ptr_cast(&header_ptr->index_buffer_offset)); + Store(NumericCast(index_buffer.size()), data_ptr_cast(&header_ptr->index_buffer_count)); + Store((uint32_t)current_width, data_ptr_cast(&header_ptr->bitpacking_width)); + + D_ASSERT(current_width == BitpackingPrimitives::MinimumBitWidth(index_buffer.size() - 1)); + D_ASSERT(DictFSSTCompression::HasEnoughSpace(current_segment->count, index_buffer.size(), current_dictionary.size, + current_width, info.GetBlockSize())); + D_ASSERT((uint64_t)*max_element(std::begin(selection_buffer), std::end(selection_buffer)) == + index_buffer.size() - 1); + + // Early-out, if the block is sufficiently full. + if (total_size >= info.GetCompactionFlushLimit()) { + return info.GetBlockSize(); + } + + // Sufficient space: calculate how much space we can save. + auto move_amount = info.GetBlockSize() - total_size; + + // Move the dictionary to align it with the offsets. + auto new_dictionary_offset = index_buffer_offset + index_buffer_size; + memmove(base_ptr + new_dictionary_offset, base_ptr + current_dictionary.end - current_dictionary.size, + current_dictionary.size); + current_dictionary.end -= move_amount; + D_ASSERT(current_dictionary.end == total_size); + + // Write the new dictionary with the updated "end". + DictFSSTCompression::SetDictionary(*current_segment, handle, current_dictionary); + return total_size; +} + +} // namespace dict_fsst +} // namespace duckdb diff --git a/src/storage/compression/dict_fsst/decompression.cpp b/src/storage/compression/dict_fsst/decompression.cpp new file mode 100644 index 000000000000..34cefcd02f25 --- /dev/null +++ b/src/storage/compression/dict_fsst/decompression.cpp @@ -0,0 +1,126 @@ +#include "duckdb/storage/compression/dict_fsst/decompression.hpp" + +namespace duckdb { +namespace dict_fsst { + +uint16_t CompressedStringScanState::GetStringLength(sel_t index) { + if (index == 0) { + return 0; + } else { + return UnsafeNumericCast(index_buffer_ptr[index] - index_buffer_ptr[index - 1]); + } +} + +string_t CompressedStringScanState::FetchStringFromDict(int32_t dict_offset, uint16_t string_len) { + D_ASSERT(dict_offset >= 0 && dict_offset <= NumericCast(block_size)); + if (dict_offset == 0) { + return string_t(nullptr, 0); + } + + // normal string: read string from this block + auto dict_end = baseptr + dict.end; + auto dict_pos = dict_end - dict_offset; + + auto str_ptr = char_ptr_cast(dict_pos); + return string_t(str_ptr, string_len); +} + +void CompressedStringScanState::Initialize(ColumnSegment &segment, bool initialize_dictionary) { + baseptr = handle->Ptr() + segment.GetBlockOffset(); + + // Load header values + auto header_ptr = reinterpret_cast(baseptr); + auto index_buffer_offset = Load(data_ptr_cast(&header_ptr->index_buffer_offset)); + index_buffer_count = Load(data_ptr_cast(&header_ptr->index_buffer_count)); + current_width = (bitpacking_width_t)(Load(data_ptr_cast(&header_ptr->bitpacking_width))); + if (segment.GetBlockOffset() + index_buffer_offset + sizeof(uint32_t) * index_buffer_count > + segment.GetBlockManager().GetBlockSize()) { + throw IOException( + "Failed to scan dictionary string - index was out of range. Database file appears to be corrupted."); + } + index_buffer_ptr = reinterpret_cast(baseptr + index_buffer_offset); + base_data = data_ptr_cast(baseptr + DictFSSTCompression::DICTIONARY_HEADER_SIZE); + + block_size = segment.GetBlockManager().GetBlockSize(); + + dict = DictFSSTCompression::GetDictionary(segment, *handle); + + if (!initialize_dictionary) { + // Used by fetch, as fetch will never produce a DictionaryVector + return; + } + + dictionary = make_buffer(segment.type, index_buffer_count); + dictionary_size = index_buffer_count; + auto dict_child_data = FlatVector::GetData(*(dictionary)); + auto &validity = FlatVector::Validity(*dictionary); + D_ASSERT(index_buffer_count >= 1); + validity.SetInvalid(0); + for (uint32_t i = 0; i < index_buffer_count; i++) { + // NOTE: the passing of dict_child_vector, will not be used, its for big strings + uint16_t str_len = GetStringLength(i); + dict_child_data[i] = FetchStringFromDict(UnsafeNumericCast(index_buffer_ptr[i]), str_len); + } +} + +void CompressedStringScanState::ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count) { + auto result_data = FlatVector::GetData(result); + auto &validity = FlatVector::Validity(result); + + // Handling non-bitpacking-group-aligned start values; + idx_t start_offset = start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; + + // We will scan in blocks of BITPACKING_ALGORITHM_GROUP_SIZE, so we may scan some extra values. + idx_t decompress_count = BitpackingPrimitives::RoundUpToAlgorithmGroupSize(scan_count + start_offset); + + // Create a decompression buffer of sufficient size if we don't already have one. + if (!sel_vec || sel_vec_size < decompress_count) { + sel_vec_size = decompress_count; + sel_vec = make_buffer(decompress_count); + } + + data_ptr_t src = &base_data[((start - start_offset) * current_width) / 8]; + sel_t *sel_vec_ptr = sel_vec->data(); + + BitpackingPrimitives::UnPackBuffer(data_ptr_cast(sel_vec_ptr), src, decompress_count, current_width); + + for (idx_t i = 0; i < scan_count; i++) { + // Lookup dict offset in index buffer + auto string_number = sel_vec->get_index(i + start_offset); + if (string_number == 0) { + validity.SetInvalid(result_offset + i); + } + auto dict_offset = index_buffer_ptr[string_number]; + auto str_len = GetStringLength(UnsafeNumericCast(string_number)); + result_data[result_offset + i] = FetchStringFromDict(UnsafeNumericCast(dict_offset), str_len); + } +} + +void CompressedStringScanState::ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset, + idx_t start, idx_t scan_count) { + D_ASSERT(start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0); + D_ASSERT(scan_count == STANDARD_VECTOR_SIZE); + D_ASSERT(result_offset == 0); + + idx_t decompress_count = BitpackingPrimitives::RoundUpToAlgorithmGroupSize(scan_count); + + // Create a selection vector of sufficient size if we don't already have one. + if (!sel_vec || sel_vec_size < decompress_count) { + sel_vec_size = decompress_count; + sel_vec = make_buffer(decompress_count); + } + + // Scanning 2048 values, emitting a dict vector + data_ptr_t dst = data_ptr_cast(sel_vec->data()); + data_ptr_t src = data_ptr_cast(&base_data[(start * current_width) / 8]); + + BitpackingPrimitives::UnPackBuffer(dst, src, scan_count, current_width); + + result.Dictionary(*(dictionary), dictionary_size, *sel_vec, scan_count); + // FIXME: this assumes the type is VectorType::DICTIONARY + // this could fail if the result is a ConstantVector instead. + DictionaryVector::SetDictionaryId(result, to_string(CastPointerToValue(&segment))); +} + +} // namespace dict_fsst +} // namespace duckdb diff --git a/src/storage/compression/dictionary/analyze.cpp b/src/storage/compression/dictionary/analyze.cpp index 3d12bc2e1560..ddb0cbdad768 100644 --- a/src/storage/compression/dictionary/analyze.cpp +++ b/src/storage/compression/dictionary/analyze.cpp @@ -1,6 +1,7 @@ #include "duckdb/storage/compression/dictionary/analyze.hpp" namespace duckdb { +namespace dictionary { DictionaryAnalyzeState::DictionaryAnalyzeState(const CompressionInfo &info) : DictionaryCompressionState(info), segment_count(0), current_tuple_count(0), current_unique_count(0), @@ -51,4 +52,5 @@ void DictionaryAnalyzeState::Flush(bool final) { void DictionaryAnalyzeState::Verify() { } +} // namespace dictionary } // namespace duckdb diff --git a/src/storage/compression/dictionary/common.cpp b/src/storage/compression/dictionary/common.cpp index 05be9aaed95f..3ab375308753 100644 --- a/src/storage/compression/dictionary/common.cpp +++ b/src/storage/compression/dictionary/common.cpp @@ -1,6 +1,7 @@ #include "duckdb/storage/compression/dictionary/common.hpp" namespace duckdb { +namespace dictionary { //===--------------------------------------------------------------------===// // Helper Functions @@ -87,4 +88,5 @@ bool DictionaryCompressionState::UpdateState(Vector &scan_vector, idx_t count) { return true; } +} // namespace dictionary } // namespace duckdb diff --git a/src/storage/compression/dictionary/compression.cpp b/src/storage/compression/dictionary/compression.cpp index 064697fc7c54..af9f100e9246 100644 --- a/src/storage/compression/dictionary/compression.cpp +++ b/src/storage/compression/dictionary/compression.cpp @@ -2,6 +2,7 @@ #include "duckdb/storage/segment/uncompressed.hpp" namespace duckdb { +namespace dictionary { DictionaryCompressionCompressState::DictionaryCompressionCompressState(ColumnDataCheckpointData &checkpoint_data_p, const CompressionInfo &info) @@ -170,4 +171,5 @@ idx_t DictionaryCompressionCompressState::Finalize() { return total_size; } +} // namespace dictionary } // namespace duckdb diff --git a/src/storage/compression/dictionary/decompression.cpp b/src/storage/compression/dictionary/decompression.cpp index dd1031328a63..065865fabb7d 100644 --- a/src/storage/compression/dictionary/decompression.cpp +++ b/src/storage/compression/dictionary/decompression.cpp @@ -1,6 +1,7 @@ #include "duckdb/storage/compression/dictionary/decompression.hpp" namespace duckdb { +namespace dictionary { uint16_t CompressedStringScanState::GetStringLength(sel_t index) { if (index == 0) { @@ -121,4 +122,5 @@ void CompressedStringScanState::ScanToDictionaryVector(ColumnSegment &segment, V DictionaryVector::SetDictionaryId(result, to_string(CastPointerToValue(&segment))); } +} // namespace dictionary } // namespace duckdb diff --git a/src/storage/compression/dictionary_compression.cpp b/src/storage/compression/dictionary_compression.cpp index 78915a374288..379e83bd5107 100644 --- a/src/storage/compression/dictionary_compression.cpp +++ b/src/storage/compression/dictionary_compression.cpp @@ -46,6 +46,7 @@ Data layout per segment: */ namespace duckdb { +namespace dictionary { struct DictionaryCompressionStorage { static unique_ptr StringInitAnalyze(ColumnData &col_data, PhysicalType type); @@ -153,17 +154,21 @@ void DictionaryCompressionStorage::StringFetchRow(ColumnSegment &segment, Column scan_state.ScanToFlatVector(result, result_idx, NumericCast(row_id), 1); } +} // namespace dictionary + //===--------------------------------------------------------------------===// // Get Function //===--------------------------------------------------------------------===// CompressionFunction DictionaryCompressionFun::GetFunction(PhysicalType data_type) { auto res = CompressionFunction( - CompressionType::COMPRESSION_DICTIONARY, data_type, DictionaryCompressionStorage ::StringInitAnalyze, - DictionaryCompressionStorage::StringAnalyze, DictionaryCompressionStorage::StringFinalAnalyze, - DictionaryCompressionStorage::InitCompression, DictionaryCompressionStorage::Compress, - DictionaryCompressionStorage::FinalizeCompress, DictionaryCompressionStorage::StringInitScan, - DictionaryCompressionStorage::StringScan, DictionaryCompressionStorage::StringScanPartial, - DictionaryCompressionStorage::StringFetchRow, UncompressedFunctions::EmptySkip, + CompressionType::COMPRESSION_DICTIONARY, data_type, dictionary::DictionaryCompressionStorage::StringInitAnalyze, + dictionary::DictionaryCompressionStorage::StringAnalyze, + dictionary::DictionaryCompressionStorage::StringFinalAnalyze, + dictionary::DictionaryCompressionStorage::InitCompression, dictionary::DictionaryCompressionStorage::Compress, + dictionary::DictionaryCompressionStorage::FinalizeCompress, + dictionary::DictionaryCompressionStorage::StringInitScan, dictionary::DictionaryCompressionStorage::StringScan, + dictionary::DictionaryCompressionStorage::StringScanPartial, + dictionary::DictionaryCompressionStorage::StringFetchRow, UncompressedFunctions::EmptySkip, UncompressedStringStorage::StringInitSegment); res.validity = CompressionValidity::NO_VALIDITY_REQUIRED; return res;