Route string access in DictFSSTCompressionState::UpdateState through two new virtual hooks.

ProcessStrings() is called once per vector before the row loop, and GetString() is used to
fetch each row's string. The analyze state implements both as pass-throughs; the compress
state does the same unless fsst_encoded is set, in which case both throw
NotImplementedException. Two dictionary tests are switched to force 'dict_fsst'.

diff --git a/src/include/duckdb/storage/compression/dict_fsst/analyze.hpp b/src/include/duckdb/storage/compression/dict_fsst/analyze.hpp
index 750880ac0298..05ab08e85706 100644
--- a/src/include/duckdb/storage/compression/dict_fsst/analyze.hpp
+++ b/src/include/duckdb/storage/compression/dict_fsst/analyze.hpp
@@ -22,6 +22,8 @@ struct DictFSSTAnalyzeState : public DictFSSTCompressionState {
 	void AddNull() override;
 	bool HasRoomForString(bool new_string, idx_t string_size) override;
 	void Flush(bool final = false) override;
+	void ProcessStrings(UnifiedVectorFormat &input, idx_t count) override;
+	const string_t &GetString(const string_t *strings, idx_t index, idx_t raw_index) override;
 	void Verify() override;
 
 public:
diff --git a/src/include/duckdb/storage/compression/dict_fsst/common.hpp b/src/include/duckdb/storage/compression/dict_fsst/common.hpp
index e28e618c06b8..8b0d8f1c8a59 100644
--- a/src/include/duckdb/storage/compression/dict_fsst/common.hpp
+++ b/src/include/duckdb/storage/compression/dict_fsst/common.hpp
@@ -58,6 +58,10 @@ class DictFSSTCompressionState : public CompressionState {
 	virtual bool HasRoomForString(bool new_string, idx_t string_size) = 0;
 	// Flush the segment to disk if compressing or reset the counters if analyzing
 	virtual void Flush(bool final = false) = 0;
+	// Process the strings of the vector if necessary
+	virtual void ProcessStrings(UnifiedVectorFormat &input, idx_t count) = 0;
+	// Retrieve the string given the indices
+	virtual const string_t &GetString(const string_t *strings, idx_t index, idx_t raw_index) = 0;
 
 protected:
 	//! Whether the dictionary has been encoded with FSST
diff --git a/src/include/duckdb/storage/compression/dict_fsst/compression.hpp b/src/include/duckdb/storage/compression/dict_fsst/compression.hpp
index 7dd0d546b9ae..f8cc37ac1abc 100644
--- a/src/include/duckdb/storage/compression/dict_fsst/compression.hpp
+++ b/src/include/duckdb/storage/compression/dict_fsst/compression.hpp
@@ -35,6 +35,8 @@ struct DictFSSTCompressionCompressState : public DictFSSTCompressionState {
 	void AddLookup(uint32_t lookup_result) override;
 	bool HasRoomForString(bool new_string, idx_t string_size) override;
 	void Flush(bool final = false) override;
+	void ProcessStrings(UnifiedVectorFormat &input, idx_t count) override;
+	const string_t &GetString(const string_t *strings, idx_t index, idx_t raw_index) override;
 	idx_t Finalize();
 
 public:
diff --git a/src/storage/compression/dict_fsst/analyze.cpp b/src/storage/compression/dict_fsst/analyze.cpp
index 953d0b9d3dab..2b102052fe0f 100644
--- a/src/storage/compression/dict_fsst/analyze.cpp
+++ b/src/storage/compression/dict_fsst/analyze.cpp
@@ -32,6 +32,14 @@ void DictFSSTAnalyzeState::AddNull() {
 	current_tuple_count++;
 }
 
+void DictFSSTAnalyzeState::ProcessStrings(UnifiedVectorFormat &input, idx_t count) {
+	return;
+}
+
+const string_t &DictFSSTAnalyzeState::GetString(const string_t *strings, idx_t index, idx_t raw_index) {
+	return strings[index];
+}
+
 bool DictFSSTAnalyzeState::HasRoomForString(bool new_string, idx_t string_size) {
 	if (!new_string) {
 		return DictFSSTCompression::HasEnoughSpace(current_tuple_count + 1, current_unique_count, current_dict_size,
diff --git a/src/storage/compression/dict_fsst/common.cpp b/src/storage/compression/dict_fsst/common.cpp
index 0012828f4a7e..cf2853eea49f 100644
--- a/src/storage/compression/dict_fsst/common.cpp
+++ b/src/storage/compression/dict_fsst/common.cpp
@@ -48,19 +48,22 @@ bool DictFSSTCompressionState::UpdateState(Vector &scan_vector, idx_t count) {
 	auto data = UnifiedVectorFormat::GetData<string_t>(vdata);
 	Verify();
 
+	ProcessStrings(vdata, count);
+
 	for (idx_t i = 0; i < count; i++) {
 		auto idx = vdata.sel->get_index(i);
 		idx_t string_size = 0;
 		optional_idx lookup_result;
 		auto row_is_valid = vdata.validity.RowIsValid(idx);
 
+		auto &str = GetString(data, idx, i);
 		if (row_is_valid) {
-			string_size = data[idx].GetSize();
+			string_size = str.GetSize();
 			if (string_size >= StringUncompressed::GetStringBlockLimit(info.GetBlockSize())) {
 				// Big strings not implemented for dictionary compression
 				return false;
 			}
-			lookup_result = LookupString(data[idx]);
+			lookup_result = LookupString(str);
 		}
 
 		bool new_string = !lookup_result.IsValid();
@@ -81,7 +84,7 @@ bool DictFSSTCompressionState::UpdateState(Vector &scan_vector, idx_t count) {
 		} else if (lookup_result.IsValid()) {
 			AddLookup(UnsafeNumericCast<uint32_t>(lookup_result.GetIndex()));
 		} else {
-			AddNewString(data[idx]);
+			AddNewString(str);
 		}
 
 		Verify();
diff --git a/src/storage/compression/dict_fsst/compression.cpp b/src/storage/compression/dict_fsst/compression.cpp
index f78ee3f6f4c0..3e228f4eba78 100644
--- a/src/storage/compression/dict_fsst/compression.cpp
+++ b/src/storage/compression/dict_fsst/compression.cpp
@@ -124,6 +124,23 @@ void DictFSSTCompressionCompressState::Flush(bool final) {
 	}
 }
 
+void DictFSSTCompressionCompressState::ProcessStrings(UnifiedVectorFormat &input, idx_t count) {
+	if (!fsst_encoded) {
+		// No need to process anything
+		return;
+	}
+	throw NotImplementedException("FSST ENCODED PROCESS STRINGS");
+	// TODO: perform fsst encoding on the provided strings
+}
+
+const string_t &DictFSSTCompressionCompressState::GetString(const string_t *strings, idx_t index, idx_t raw_index) {
+	if (!fsst_encoded) {
+		return strings[index];
+	}
+	throw NotImplementedException("FSST ENCODED GET STRING");
+	// TODO: look up the encoded string given the 'raw_index'
+}
+
 idx_t DictFSSTCompressionCompressState::Finalize() {
 	auto &buffer_manager = BufferManager::GetBufferManager(checkpoint_data.GetDatabase());
 	auto handle = buffer_manager.Pin(current_segment->block);
diff --git a/test/sql/storage/compression/dictionary/fetch_row.test b/test/sql/storage/compression/dictionary/fetch_row.test
index f2202ed44a4d..b0e3852d3580 100644
--- a/test/sql/storage/compression/dictionary/fetch_row.test
+++ b/test/sql/storage/compression/dictionary/fetch_row.test
@@ -8,7 +8,7 @@ require block_size 262144
 load __TEST_DIR__/test_dictionary_fetchrow.db
 
 statement ok
-PRAGMA force_compression = 'dictionary'
+PRAGMA force_compression = 'dict_fsst'
 
 statement ok
 CREATE TABLE test (
diff --git a/test/sql/storage/compression/dictionary/force_dictionary.test b/test/sql/storage/compression/dictionary/force_dictionary.test
index e415dab25c56..575b4641e392 100644
--- a/test/sql/storage/compression/dictionary/force_dictionary.test
+++ b/test/sql/storage/compression/dictionary/force_dictionary.test
@@ -7,7 +7,7 @@ require vector_size 2048
 load __TEST_DIR__/force_dictionary.db
 
 statement ok
-PRAGMA force_compression = 'dictionary'
+PRAGMA force_compression = 'dict_fsst';
 
 statement ok
 CREATE TABLE test_dict (a VARCHAR);
@@ -21,4 +21,4 @@ CHECKPOINT
 query I
 SELECT compression FROM pragma_storage_info('test_dict') WHERE segment_type ILIKE 'VARCHAR' LIMIT 1
 ----
-Dictionary
+Dict FSST