Skip to content

Commit

Permalink
prepare for the need to encode input strings once the dictionary is fsst encoded
Browse files Browse the repository at this point in the history
  • Loading branch information
Tishj committed Jan 7, 2025
1 parent d924e35 commit 468db86
Show file tree
Hide file tree
Showing 8 changed files with 42 additions and 6 deletions.
2 changes: 2 additions & 0 deletions src/include/duckdb/storage/compression/dict_fsst/analyze.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ struct DictFSSTAnalyzeState : public DictFSSTCompressionState {
void AddNull() override;
bool HasRoomForString(bool new_string, idx_t string_size) override;
void Flush(bool final = false) override;
void ProcessStrings(UnifiedVectorFormat &input, idx_t count) override;
const string_t &GetString(const string_t *strings, idx_t index, idx_t raw_index) override;
void Verify() override;

public:
Expand Down
4 changes: 4 additions & 0 deletions src/include/duckdb/storage/compression/dict_fsst/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ class DictFSSTCompressionState : public CompressionState {
virtual bool HasRoomForString(bool new_string, idx_t string_size) = 0;
// Flush the segment to disk if compressing or reset the counters if analyzing
virtual void Flush(bool final = false) = 0;
// Process the strings of the vector if necessary
virtual void ProcessStrings(UnifiedVectorFormat &input, idx_t count) = 0;
// Retrieve the string given the indices
virtual const string_t &GetString(const string_t *strings, idx_t index, idx_t raw_index) = 0;

protected:
//! Whether the dictionary has been encoded with FSST
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ struct DictFSSTCompressionCompressState : public DictFSSTCompressionState {
void AddLookup(uint32_t lookup_result) override;
bool HasRoomForString(bool new_string, idx_t string_size) override;
void Flush(bool final = false) override;
void ProcessStrings(UnifiedVectorFormat &input, idx_t count) override;
const string_t &GetString(const string_t *strings, idx_t index, idx_t raw_index) override;
idx_t Finalize();

public:
Expand Down
8 changes: 8 additions & 0 deletions src/storage/compression/dict_fsst/analyze.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ void DictFSSTAnalyzeState::AddNull() {
current_tuple_count++;
}

//! String pre-processing hook for the analyze state.
//! This is deliberately a no-op: neither 'input' nor 'count' is inspected.
void DictFSSTAnalyzeState::ProcessStrings(UnifiedVectorFormat &input, idx_t count) {
	// Intentionally empty
}

//! Returns a reference to the entry of 'strings' located at position 'index'.
//! 'raw_index' is accepted to satisfy the interface but is not used here.
const string_t &DictFSSTAnalyzeState::GetString(const string_t *strings, idx_t index, idx_t raw_index) {
	const string_t &selected = strings[index];
	return selected;
}

bool DictFSSTAnalyzeState::HasRoomForString(bool new_string, idx_t string_size) {
if (!new_string) {
return DictFSSTCompression::HasEnoughSpace(current_tuple_count + 1, current_unique_count, current_dict_size,
Expand Down
9 changes: 6 additions & 3 deletions src/storage/compression/dict_fsst/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,22 @@ bool DictFSSTCompressionState::UpdateState(Vector &scan_vector, idx_t count) {
auto data = UnifiedVectorFormat::GetData<string_t>(vdata);
Verify();

ProcessStrings(vdata, count);

for (idx_t i = 0; i < count; i++) {
auto idx = vdata.sel->get_index(i);
idx_t string_size = 0;
optional_idx lookup_result;
auto row_is_valid = vdata.validity.RowIsValid(idx);

auto &str = GetString(data, idx, i);
if (row_is_valid) {
string_size = data[idx].GetSize();
string_size = str.GetSize();
if (string_size >= StringUncompressed::GetStringBlockLimit(info.GetBlockSize())) {
// Big strings not implemented for dictionary compression
return false;
}
lookup_result = LookupString(data[idx]);
lookup_result = LookupString(str);
}

bool new_string = !lookup_result.IsValid();
Expand All @@ -81,7 +84,7 @@ bool DictFSSTCompressionState::UpdateState(Vector &scan_vector, idx_t count) {
} else if (lookup_result.IsValid()) {
AddLookup(UnsafeNumericCast<uint32_t>(lookup_result.GetIndex()));
} else {
AddNewString(data[idx]);
AddNewString(str);
}

Verify();
Expand Down
17 changes: 17 additions & 0 deletions src/storage/compression/dict_fsst/compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,23 @@ void DictFSSTCompressionCompressState::Flush(bool final) {
}
}

//! String pre-processing hook for the compress state.
//! While 'fsst_encoded' is false this does nothing; once it is true the
//! call throws a NotImplementedException because encoding is not written yet.
void DictFSSTCompressionCompressState::ProcessStrings(UnifiedVectorFormat &input, idx_t count) {
	if (fsst_encoded) {
		// TODO: perform fsst encoding on the provided strings
		throw NotImplementedException("FSST ENCODED PROCESS STRINGS");
	}
	// Not FSST encoded: no need to process anything
}

//! Retrieves the string to use for the given indices.
//! While 'fsst_encoded' is false this is the entry of 'strings' at 'index';
//! once it is true the call throws a NotImplementedException because the
//! encoded lookup is not written yet.
const string_t &DictFSSTCompressionCompressState::GetString(const string_t *strings, idx_t index, idx_t raw_index) {
	if (fsst_encoded) {
		// TODO: look up the encoded string given the 'raw_index'
		throw NotImplementedException("FSST ENCODED GET STRING");
	}
	// Not FSST encoded: hand back the input string untouched
	return strings[index];
}

idx_t DictFSSTCompressionCompressState::Finalize() {
auto &buffer_manager = BufferManager::GetBufferManager(checkpoint_data.GetDatabase());
auto handle = buffer_manager.Pin(current_segment->block);
Expand Down
2 changes: 1 addition & 1 deletion test/sql/storage/compression/dictionary/fetch_row.test
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ require block_size 262144
load __TEST_DIR__/test_dictionary_fetchrow.db

statement ok
PRAGMA force_compression = 'dictionary'
PRAGMA force_compression = 'dict_fsst'

statement ok
CREATE TABLE test (
Expand Down
4 changes: 2 additions & 2 deletions test/sql/storage/compression/dictionary/force_dictionary.test
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ require vector_size 2048
load __TEST_DIR__/force_dictionary.db

statement ok
PRAGMA force_compression = 'dictionary'
PRAGMA force_compression = 'dict_fsst';

statement ok
CREATE TABLE test_dict (a VARCHAR);
Expand All @@ -21,4 +21,4 @@ CHECKPOINT
query I
SELECT compression FROM pragma_storage_info('test_dict') WHERE segment_type ILIKE 'VARCHAR' LIMIT 1
----
Dictionary
Dict FSST

0 comments on commit 468db86

Please sign in to comment.