From 70d811554dc9f484d240f53a8532d74feea04b43 Mon Sep 17 00:00:00 2001 From: Tishj Date: Fri, 17 Jan 2025 15:16:38 +0100 Subject: [PATCH] slowly getting there, FSST_ONLY is currently still broken --- .../storage/compression/dict_fsst/common.hpp | 4 -- .../compression/dict_fsst/compression.hpp | 2 +- .../compression/dict_fsst/compression.cpp | 61 +++++++++++-------- .../compression/dict_fsst/decompression.cpp | 10 ++- .../compression/dict_fsst/dict_fsst_test.test | 2 +- 5 files changed, 45 insertions(+), 34 deletions(-) diff --git a/src/include/duckdb/storage/compression/dict_fsst/common.hpp b/src/include/duckdb/storage/compression/dict_fsst/common.hpp index fcaa70e74a1d..46af3ced96ac 100644 --- a/src/include/duckdb/storage/compression/dict_fsst/common.hpp +++ b/src/include/duckdb/storage/compression/dict_fsst/common.hpp @@ -38,10 +38,6 @@ struct DictFSSTCompression { //! Dictionary header size at the beginning of the string segment (offset + length) static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(dict_fsst_compression_header_t); static constexpr idx_t STRING_SIZE_LIMIT = 16384; - -public: - static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle); - static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer container); }; } // namespace dict_fsst diff --git a/src/include/duckdb/storage/compression/dict_fsst/compression.hpp b/src/include/duckdb/storage/compression/dict_fsst/compression.hpp index 94ec7799c2ea..f9c4fca83d60 100644 --- a/src/include/duckdb/storage/compression/dict_fsst/compression.hpp +++ b/src/include/duckdb/storage/compression/dict_fsst/compression.hpp @@ -89,11 +89,11 @@ struct DictFSSTCompressionState : public CompressionState { idx_t tuple_count = 0; unique_ptr analyze; bool all_unique = true; + idx_t symbol_table_size = DConstants::INVALID_INDEX; private: void *encoder = nullptr; unsafe_unique_array fsst_serialized_symbol_table; - idx_t symbol_table_size = DConstants::INVALID_INDEX; DictionaryAppendState append_state = DictionaryAppendState::REGULAR; }; diff --git a/src/storage/compression/dict_fsst/compression.cpp b/src/storage/compression/dict_fsst/compression.cpp index f4f16be57e11..faa8ca02f10f 100644 --- a/src/storage/compression/dict_fsst/compression.cpp +++ b/src/storage/compression/dict_fsst/compression.cpp @@ -7,9 +7,10 @@ namespace duckdb { namespace dict_fsst { DictFSSTCompressionState::DictFSSTCompressionState(ColumnDataCheckpointData &checkpoint_data_p, - unique_ptr &&analyze) - : CompressionState(analyze->info), checkpoint_data(checkpoint_data_p), - function(checkpoint_data.GetCompressionFunction(CompressionType::COMPRESSION_DICT_FSST)) { + unique_ptr &&analyze_p) + : CompressionState(analyze_p->info), checkpoint_data(checkpoint_data_p), + function(checkpoint_data.GetCompressionFunction(CompressionType::COMPRESSION_DICT_FSST)), + analyze(std::move(analyze_p)) { CreateEmptySegment(checkpoint_data.GetRowGroup().start); } @@ -21,7 +22,6 @@ DictFSSTCompressionState::~DictFSSTCompressionState() { } static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(dict_fsst_compression_header_t); -static constexpr idx_t STRING_SIZE_LIMIT = 16384; static constexpr uint16_t FSST_SYMBOL_TABLE_SIZE = sizeof(duckdb_fsst_decoder_t); static constexpr idx_t DICTIONARY_ENCODE_THRESHOLD = 4096; @@ -69,6 +69,23 @@ idx_t DictFSSTCompressionState::Finalize() { auto string_lengths_dest = AlignValue(symbol_table_dest + symbol_table_size); auto dictionary_indices_dest = AlignValue(string_lengths_dest + string_lengths_space); +#ifdef DEBUG + idx_t required_space = 0; + required_space += sizeof(dict_fsst_compression_header_t); + required_space = AlignValue(required_space); + required_space += dictionary_offset; + required_space = AlignValue(required_space); + if (is_fsst_encoded) { + required_space += symbol_table_size; + } + required_space = AlignValue(required_space); + required_space += string_lengths_space; + required_space = AlignValue(required_space); + required_space += dictionary_indices_space; + + D_ASSERT(info.GetBlockSize() >= required_space); +#endif + header_ptr->mode = ConvertToMode(append_state); header_ptr->symbol_table_size = symbol_table_size; header_ptr->dict_size = dictionary_offset; @@ -94,22 +111,6 @@ idx_t DictFSSTCompressionState::Finalize() { auto expected_bitwidth = BitpackingPrimitives::MinimumBitWidth(dict_count - 1); D_ASSERT(dictionary_indices_width == expected_bitwidth); } - -#ifdef DEBUG - idx_t required_space = 0; - required_space += sizeof(dict_fsst_compression_header_t); - required_space = AlignValue(required_space); - required_space += dictionary_offset; - if (is_fsst_encoded) { - required_space += symbol_table_size; - } - required_space = AlignValue(required_space); - required_space += dictionary_indices_space; - required_space = AlignValue(required_space); - required_space += string_lengths_space; - - D_ASSERT(info.GetBlockSize() >= required_space); -#endif D_ASSERT((uint64_t)*max_element(std::begin(dictionary_indices), std::end(dictionary_indices)) == dict_count - 1); return total_size; } @@ -163,7 +164,8 @@ void DictFSSTCompressionState::FlushEncodingBuffer() { string_lengths_width = BitpackingPrimitives::MinimumBitWidth(biggest_strlen); } real_string_lengths_width = string_lengths_width; - string_lengths_space = BitpackingPrimitives::GetRequiredSize(dict_count, string_lengths_space); + string_lengths_space = BitpackingPrimitives::GetRequiredSize(dict_count, string_lengths_width); + D_ASSERT(string_lengths_space != 0); to_encode_string_sum = 0; dictionary_encoding_buffer.clear(); } @@ -214,10 +216,6 @@ void DictFSSTCompressionState::Flush(bool final) { auto &state = checkpoint_data.GetCheckpointState(); state.FlushSegment(std::move(current_segment), std::move(current_handle), segment_size); - if (!final) { - CreateEmptySegment(next_start); - } - // Reset the state uncompressed_dictionary_copy.Destroy(); //! This should already be empty at this point, otherwise that means that strings are not encoded / not added to the @@ -234,6 +232,10 @@ void DictFSSTCompressionState::Flush(bool final) { encoder = nullptr; symbol_table_size = DConstants::INVALID_INDEX; } + + if (!final) { + CreateEmptySegment(next_start); + } } static inline bool RequiresHigherBitWidth(bitpacking_width_t bitwidth, uint32_t other) { @@ -261,6 +263,10 @@ static inline bool AddLookup(DictFSSTCompressionState &state, idx_t lookup, cons required_space += state.dictionary_offset; } required_space = AlignValue(required_space); + if (IsEncoded(APPEND_STATE)) { + required_space += state.symbol_table_size; + required_space = AlignValue(required_space); + } required_space += new_dictionary_indices_space; required_space = AlignValue(required_space); required_space += state.string_lengths_space; @@ -332,6 +338,10 @@ static inline bool AddToDictionary(DictFSSTCompressionState &state, const string required_space += state.dictionary_offset + str_len; } required_space = AlignValue(required_space); + if (IsEncoded(APPEND_STATE)) { + required_space += state.symbol_table_size; + required_space = AlignValue(required_space); + } required_space += new_dictionary_indices_space; required_space = AlignValue(required_space); required_space += new_string_lengths_space; @@ -374,6 +384,7 @@ static inline bool AddToDictionary(DictFSSTCompressionState &state, const string } if (requires_higher_strlen_bitwidth || recalculate_strlen_space) { state.string_lengths_space = new_string_lengths_space; + D_ASSERT(state.string_lengths_space != 0); } if (requires_higher_indices_bitwidth) { diff --git a/src/storage/compression/dict_fsst/decompression.cpp b/src/storage/compression/dict_fsst/decompression.cpp index 8fb0c4ab448f..33213f861883 100644 --- a/src/storage/compression/dict_fsst/decompression.cpp +++ b/src/storage/compression/dict_fsst/decompression.cpp @@ -77,8 +77,10 @@ void CompressedStringScanState::Initialize(bool initialize_dictionary) { (void)(ret); D_ASSERT(ret != 0); // FIXME: the old code set the decoder to nullptr instead, why??? - auto string_block_limit = StringUncompressed::GetStringBlockLimit(segment.GetBlockManager().GetBlockSize()); - decompress_buffer.resize(string_block_limit + 1); + //! The biggest string_length covered by the 'string_lengths_width' + uint32_t max_string_length = (1 << string_lengths_width) - 1; + auto buffer_size = (max_string_length * 8) + 1; + decompress_buffer.resize(buffer_size); break; } default: @@ -102,8 +104,10 @@ void CompressedStringScanState::Initialize(bool initialize_dictionary) { int32_t offset = 0; for (uint32_t i = 0; i < dict_count; i++) { + //! We can uncompress during fetching, we need the length of the string inside the dictionary + auto string_len = string_lengths[i]; dict_child_data[i] = FetchStringFromDict(*dictionary, offset, i); - offset += dict_child_data[i].GetSize(); + offset += string_len; } } diff --git a/test/sql/storage/compression/dict_fsst/dict_fsst_test.test b/test/sql/storage/compression/dict_fsst/dict_fsst_test.test index 2db9d0f5bcb4..fb12d5d140db 100644 --- a/test/sql/storage/compression/dict_fsst/dict_fsst_test.test +++ b/test/sql/storage/compression/dict_fsst/dict_fsst_test.test @@ -13,7 +13,7 @@ select (i % 200)::INTEGER::VARCHAR, 2047 // len((i % 200)::INTEGER::VARCHAR) ) a -from range(20000) t(i); +from range(400) t(i); statement ok checkpoint;