Skip to content

Commit

Permalink
implicit conversions galore, but it compiles now
Browse files Browse the repository at this point in the history
  • Loading branch information
Tishj committed Jan 16, 2025
1 parent f7344eb commit 22771bf
Show file tree
Hide file tree
Showing 7 changed files with 526 additions and 688 deletions.
77 changes: 1 addition & 76 deletions src/include/duckdb/storage/compression/dict_fsst/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ typedef struct {
uint32_t string_lengths_offset;
uint32_t string_lengths_width;
uint32_t dict_count;
uint32_t bitpacking_width;
uint32_t dictionary_indices_width;
DictFSSTMode mode;
} dict_fsst_compression_header_t;

Expand All @@ -41,85 +41,10 @@ struct DictFSSTCompression {
static constexpr idx_t STRING_SIZE_LIMIT = 16384;

public:
static bool HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size,
bitpacking_width_t packing_width, bitpacking_width_t string_lengths_width,
const idx_t block_size);
static idx_t RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size,
bitpacking_width_t packing_width, bitpacking_width_t string_lengths_width);

static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle);
static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer container);
};

//! Lightweight, non-owning pair of a string and (optionally) its encoded counterpart.
struct StringData {
public:
	//! 'encoded_string' may be left out (nullptr), in which case Get() yields 'string'
	explicit StringData(const string_t &string, optional_ptr<const string_t> encoded_string = nullptr)
	    : string(string), encoded_string(encoded_string) {
	}

	//! Returns the encoded string if one is set, otherwise the plain string
	const string_t &Get() const {
		if (!encoded_string) {
			return string;
		}
		return *encoded_string;
	}

	//! Held by reference: the referenced string_t has to outlive this object
	const string_t &string;
	//! Optional pointer to the encoded form; not owned by this struct
	optional_ptr<const string_t> encoded_string;
};

//! Abstract base class that manages the compression state, used both for size analysis and for compression.
class DictFSSTCompressionState : public CompressionState {
public:
	static constexpr idx_t DICTIONARY_ENCODE_THRESHOLD = 4096;

	explicit DictFSSTCompressionState(const CompressionInfo &info);
	~DictFSSTCompressionState() override;

	//! Feeds 'count' values of 'scan_vector' into the state (implemented out of line)
	bool UpdateState(Vector &scan_vector, idx_t count);

protected:
	//! Should verify the state
	virtual void Verify() = 0;
	//! Performs a lookup of 'str', storing the result internally
	virtual optional_idx LookupString(const string_t &str) = 0;
	//! Adds the most recently looked up string to the compression state
	virtual void AddLookup(uint32_t lookup_result) = 0;
	//! Adds a string to the state that is known to not have been seen yet
	virtual void AddNewString(const StringData &str) = 0;
	//! Adds a null value to the compression state
	virtual void AddNull() = 0;
	virtual idx_t RequiredSpace(bool new_string, idx_t string_size) = 0;
	//! Flushes the segment to disk when compressing, or resets the counters when analyzing
	virtual void Flush(bool final = false) = 0;
	//! Optional hook; the default implementation does nothing
	virtual void UpdateStats(UnifiedVectorFormat &input, idx_t count) {
		// no-op
	}
	//! Processes the strings of the vector if necessary
	virtual void EncodeInputStrings(UnifiedVectorFormat &input, idx_t count) = 0;
	//! Encodes the dictionary with FSST, returns false if we decided not to encode
	virtual bool EncodeDictionary() = 0;
	//! Retrieves the string given the indices
	virtual StringData GetString(const string_t *strings, idx_t index, idx_t raw_index) = 0;

	//! True when the current append state is one of the two ENCODED variants
	bool IsEncoded() const {
		switch (append_state) {
		case DictionaryAppendState::ENCODED:
		case DictionaryAppendState::ENCODED_ALL_UNIQUE:
			return true;
		default:
			return false;
		}
	}

private:
	bool DryAppendToCurrentSegment(bool is_new, UnifiedVectorFormat &vdata, idx_t count, idx_t index, idx_t raw_index);

public:
	//! Keeps track of the append state for the current segment
	DictionaryAppendState append_state = DictionaryAppendState::REGULAR;
	//! Opaque encoder handle, null until set (presumably the FSST encoder - confirm against the implementation)
	void *encoder = nullptr;
	//! Starts out as INVALID_INDEX, i.e. no symbol table size is known yet
	idx_t symbol_table_size = DConstants::INVALID_INDEX;
	bool all_unique = true;
};

} // namespace dict_fsst

} // namespace duckdb
40 changes: 26 additions & 14 deletions src/include/duckdb/storage/compression/dict_fsst/compression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,34 @@ namespace dict_fsst {
// scanning the whole dictionary at once and then scanning the selection buffer for each emitted vector. Secondly, it
// allows for efficient bitpacking compression as the selection values should remain relatively small.

//! Encoded form of the input strings, together with the position at which encoding of the input started.
struct EncodedInput {
//! The index at which we started encoding the input.
//! In case we switch to FSST_ONLY in the middle, this lets us avoid encoding the previous input strings.
idx_t offset;
//! The encoded input strings; if the append mode is FSST_ONLY we will encode all input.
//! The string memory is not owned by this struct: it is owned by a reusable buffer stored in the state.
vector<string_t> data;
};

//===--------------------------------------------------------------------===//
// Compress
//===--------------------------------------------------------------------===//
struct DictFSSTCompressionCompressState : public CompressionState {
struct DictFSSTCompressionState : public CompressionState {
public:
DictFSSTCompressionCompressState(ColumnDataCheckpointData &checkpoint_data_p,
unique_ptr<DictFSSTAnalyzeState> &&state);
~DictFSSTCompressionCompressState() override;
DictFSSTCompressionState(ColumnDataCheckpointData &checkpoint_data_p, unique_ptr<DictFSSTAnalyzeState> &&state);
~DictFSSTCompressionState() override;

public:
void CreateEmptySegment(idx_t row_start);
idx_t Finalize();

void FlushEncodingBuffer();
DictionaryAppendState SwitchAppendState();

bool CompressInternal(UnifiedVectorFormat &vector_format, EncodedInput &encoded_input, idx_t i, idx_t count);
void Compress(Vector &scan_vector, idx_t count);
void FinalizeCompress();
void Flush();
void Flush(bool final);

public:
ColumnDataCheckpointData &checkpoint_data;
Expand All @@ -47,8 +59,7 @@ struct DictFSSTCompressionCompressState : public CompressionState {
//! Offset at which to write the next dictionary string
idx_t dictionary_offset = 0;

public:
idx_t string_lengths_space;
idx_t string_lengths_space = 0;
vector<uint32_t> string_lengths;
idx_t dict_count = 0;
bitpacking_width_t string_lengths_width = 0;
Expand All @@ -58,7 +69,7 @@ struct DictFSSTCompressionCompressState : public CompressionState {
bitpacking_width_t real_string_lengths_width = 0;
uint32_t max_string_length = 0;

idx_t dictionary_indices_space;
idx_t dictionary_indices_space = 0;
vector<uint32_t> dictionary_indices;
bitpacking_width_t dictionary_indices_width = 0;
//! uint32_t max_dictionary_index; (this is 'dict_count')
Expand All @@ -67,22 +78,23 @@ struct DictFSSTCompressionCompressState : public CompressionState {
string_map_t<uint32_t> current_string_map;
//! strings added to the dictionary waiting to be encoded
vector<string_t> dictionary_encoding_buffer;
idx_t to_encode_string_sum = 0;
//! for DICT_FSST we store uncompressed strings in the 'current_string_map', this owns that memory
StringHeap uncompressed_dictionary_copy;

//! This is used for FSST_ONLY, to store the memory of the encoded input
unsafe_unique_array<unsigned char> encoding_buffer;
idx_t encoding_buffer_size = 0;

public:
idx_t tuple_count = 0;
unique_ptr<DictFSSTAnalyzeState> analyze;
bool all_unique = true;

private:
void *encoder = nullptr;
unsafe_unique_array<unsigned char> fsst_serialized_symbol_table;
idx_t symbol_table_size = DConstants::INVALID_INDEX;
DictionaryAppendState append_state = DictionaryAppendState::REGULAR;
bool all_unique = true;

public:
idx_t tuple_count = 0;
unique_ptr<DictFSSTAnalyzeState> analyze;
};

} // namespace dict_fsst
Expand Down
12 changes: 5 additions & 7 deletions src/storage/compression/dict_fsst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,19 +88,17 @@ idx_t DictFSSTCompressionStorage::StringFinalAnalyze(AnalyzeState &state_p) {
//===--------------------------------------------------------------------===//
//! Creates the compression state for 'checkpoint_data'.
//! The incoming AnalyzeState is downcast to DictFSSTAnalyzeState and ownership of it is handed to the new state.
unique_ptr<CompressionState> DictFSSTCompressionStorage::InitCompression(ColumnDataCheckpointData &checkpoint_data,
                                                                         unique_ptr<AnalyzeState> state) {
	// Single return using the current state type; the earlier return through the old
	// 'DictFSSTCompressionCompressState' name made this statement unreachable.
	return make_uniq<DictFSSTCompressionState>(checkpoint_data,
	                                           unique_ptr_cast<AnalyzeState, DictFSSTAnalyzeState>(std::move(state)));
}

//! Forwards 'count' values of 'scan_vector' to DictFSSTCompressionState::Compress.
void DictFSSTCompressionStorage::Compress(CompressionState &state_p, Vector &scan_vector, idx_t count) {
	// 'state' is declared exactly once, cast to the current state type. The previous
	// UpdateState()/D_ASSERT path belonged to the old state class and is gone.
	auto &state = state_p.Cast<DictFSSTCompressionState>();
	state.Compress(scan_vector, count);
}

//! Finishes compression by issuing the final flush on the compression state.
void DictFSSTCompressionStorage::FinalizeCompress(CompressionState &state_p) {
	// Single declaration of 'state' using the current state type
	auto &state = state_p.Cast<DictFSSTCompressionState>();
	// 'true' marks this as the final flush
	state.Flush(true);
}

Expand Down
2 changes: 1 addition & 1 deletion src/storage/compression/dict_fsst/analyze.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
namespace duckdb {
namespace dict_fsst {

//! Constructs the analyze state, forwarding the compression info to the AnalyzeState base class.
//! (DictFSSTCompressionState is no longer a base of this class, so it must not appear in the initializer list.)
DictFSSTAnalyzeState::DictFSSTAnalyzeState(const CompressionInfo &info) : AnalyzeState(info) {
}

bool DictFSSTAnalyzeState::Analyze(Vector &input, idx_t count) {
Expand Down
Loading

0 comments on commit 22771bf

Please sign in to comment.