Skip to content

Commit

Permalink
WIP: wrong answer when ENCODED_ALL_UNIQUE is enabled... needs investi…
Browse files Browse the repository at this point in the history
…gation | also some weird patching included for a possible FSST bug
  • Loading branch information
Tishj committed Jan 20, 2025
1 parent aec9b85 commit f78187c
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,13 @@ struct DictFSSTCompressionState : public CompressionState {

idx_t tuple_count = 0;
unique_ptr<DictFSSTAnalyzeState> analyze;
//! FIXME: do we even need this? It's equivalent to: `string_lengths.size() - 1 == tuple_count`
bool all_unique = true;
idx_t symbol_table_size = DConstants::INVALID_INDEX;

//! How many values have we compressed so far?
idx_t total_tuple_count = 0;

private:
void *encoder = nullptr;
unsafe_unique_array<unsigned char> fsst_serialized_symbol_table;
Expand Down
31 changes: 15 additions & 16 deletions src/storage/compression/dict_fsst/compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,13 +184,18 @@ void DictFSSTCompressionState::CreateEmptySegment(idx_t row_start) {

append_state = DictionaryAppendState::REGULAR;
string_lengths_width = 0;
real_string_lengths_width = 0;
dictionary_indices_width = 0;
string_lengths_space = 0;
D_ASSERT(dictionary_indices.empty());
dictionary_indices_space = 0;
all_unique = true;
tuple_count = 0;
D_ASSERT(string_lengths.empty());
string_lengths.push_back(0);
dict_count = 1;
D_ASSERT(current_string_map.empty());
symbol_table_size = DConstants::INVALID_INDEX;

dictionary_offset = 0;
current_dictionary.end = 0;
Expand Down Expand Up @@ -231,6 +236,7 @@ void DictFSSTCompressionState::Flush(bool final) {
encoder = nullptr;
symbol_table_size = DConstants::INVALID_INDEX;
}
total_tuple_count += tuple_count;

if (!final) {
CreateEmptySegment(next_start);
Expand Down Expand Up @@ -407,13 +413,6 @@ bool DictFSSTCompressionState::CompressInternal(UnifiedVectorFormat &vector_form
// so we can avoid recalculating for every tuple
const bool recalculate_indices_space = ((tuple_count % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE) == 0);

#ifdef DEBUG
// TODO: remove this
if (recalculate_indices_space) {
(void)encoded_input;
}
#endif

switch (append_state) {
case DictionaryAppendState::NOT_ENCODED:
case DictionaryAppendState::REGULAR: {
Expand Down Expand Up @@ -542,11 +541,11 @@ DictionaryAppendState DictFSSTCompressionState::SwitchAppendState() {
}

DictionaryAppendState new_state;
// if (!analyze->contains_nulls && all_unique) {
// new_state = DictionaryAppendState::ENCODED_ALL_UNIQUE;
//} else {
new_state = DictionaryAppendState::ENCODED;
//}
if (!analyze->contains_nulls && all_unique) {
new_state = DictionaryAppendState::ENCODED_ALL_UNIQUE;
} else {
new_state = DictionaryAppendState::ENCODED;
}

vector<size_t> fsst_string_sizes;
vector<unsigned char *> fsst_string_ptrs;
Expand Down Expand Up @@ -614,8 +613,8 @@ DictionaryAppendState DictFSSTCompressionState::SwitchAppendState() {
symbol_table_size = duckdb_fsst_export(fsst_encoder, fsst_serialized_symbol_table.get());

#ifdef DEBUG
duckdb_fsst_decoder_t decoder;
duckdb_fsst_import(&decoder, fsst_serialized_symbol_table.get());
auto temp_decoder = alloca(sizeof(duckdb_fsst_decoder_t) + sizeof(unsigned long long));
duckdb_fsst_import((duckdb_fsst_decoder_t *)temp_decoder, fsst_serialized_symbol_table.get());

vector<unsigned char> decompress_buffer;
#endif
Expand All @@ -640,8 +639,8 @@ DictionaryAppendState DictFSSTCompressionState::SwitchAppendState() {

#ifdef DEBUG
//! Verify that we can decompress the string
decompress_buffer.resize(uncompressed_str_len + 1);
FSSTPrimitives::DecompressValue((void *)&decoder, (const char *)compressed_ptrs[i],
decompress_buffer.resize(uncompressed_str_len + 1 + 100);
FSSTPrimitives::DecompressValue((void *)temp_decoder, (const char *)compressed_ptrs[i],
(idx_t)compressed_sizes[i], decompress_buffer);

string_t decompressed_string((const char *)decompress_buffer.data(), uncompressed_str_len);
Expand Down
2 changes: 1 addition & 1 deletion test/sql/storage/compression/dict_fsst/dict_fsst_test.test
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ select
(i % 200)::INTEGER::VARCHAR,
2047 // len((i % 200)::INTEGER::VARCHAR)
) a
from range(200) t(i);
from range(2000) t(i);

statement ok
checkpoint;
Expand Down

0 comments on commit f78187c

Please sign in to comment.