forked from duckdb/duckdb
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
24 changed files
with
870 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
50 changes: 50 additions & 0 deletions
50
src/include/duckdb/storage/compression/dict_fsst/analyze.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
#pragma once | ||
|
||
#include "duckdb/storage/compression/dictionary/common.hpp" | ||
#include "duckdb/common/string_map_set.hpp" | ||
#include "duckdb/storage/table/column_data.hpp" | ||
|
||
namespace duckdb { | ||
|
||
namespace dict_fsst { | ||
|
||
//===--------------------------------------------------------------------===// | ||
// Analyze | ||
//===--------------------------------------------------------------------===// | ||
//! Analyze-phase implementation of the DictFSSTCompressionState hooks. | ||
//! All member functions are only declared here; their definitions are not part of this header. | ||
//! NOTE(review): the base class DictFSSTCompressionState is declared in dict_fsst/common.hpp, while this header | ||
//! includes dictionary/common.hpp - confirm the include is the intended one. | ||
struct DictFSSTAnalyzeState : public DictFSSTCompressionState { | ||
public: | ||
explicit DictFSSTAnalyzeState(const CompressionInfo &info); | ||
|
||
public: | ||
// Overrides of the pure virtual hooks declared on DictFSSTCompressionState | ||
bool LookupString(string_t str) override; | ||
void AddNewString(string_t str) override; | ||
void AddLastLookup() override; | ||
void AddNull() override; | ||
bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override; | ||
void Flush(bool final = false) override; | ||
void Verify() override; | ||
|
||
public: | ||
// Running counters; judging by the names these track segments seen so far and the tuples / unique strings / | ||
// dictionary bytes of the segment currently being analyzed - TODO confirm against the implementation | ||
// None of these have in-class initializers; presumably the out-of-line constructor sets them - verify | ||
idx_t segment_count; | ||
idx_t current_tuple_count; | ||
idx_t current_unique_count; | ||
idx_t current_dict_size; | ||
// Heap and set of strings for the current segment (presumably the heap backs the strings kept in the set) | ||
StringHeap heap; | ||
string_set_t current_set; | ||
// Bitpacking widths; presumably the width in use and the width required after adding the next value | ||
bitpacking_width_t current_width; | ||
bitpacking_width_t next_width; | ||
}; | ||
|
||
//! AnalyzeState wrapper that owns a DictFSSTAnalyzeState. | ||
//! The wrapped state is created in the constructor from the same CompressionInfo that is passed to AnalyzeState. | ||
struct DictFSSTCompressionAnalyzeState : public AnalyzeState { | ||
public: | ||
explicit DictFSSTCompressionAnalyzeState(const CompressionInfo &info) | ||
: AnalyzeState(info), analyze_state(make_uniq<DictFSSTAnalyzeState>(info)) { | ||
} | ||
|
||
public: | ||
//! The owned analyze state; allocated in the constructor | ||
unique_ptr<DictFSSTAnalyzeState> analyze_state; | ||
}; | ||
|
||
} // namespace dict_fsst | ||
|
||
} // namespace duckdb |
64 changes: 64 additions & 0 deletions
64
src/include/duckdb/storage/compression/dict_fsst/common.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#pragma once | ||
|
||
#include "duckdb/common/typedefs.hpp" | ||
#include "duckdb/function/compression_function.hpp" | ||
#include "duckdb/common/bitpacking.hpp" | ||
#include "duckdb/storage/string_uncompressed.hpp" | ||
|
||
namespace duckdb { | ||
|
||
namespace dict_fsst { | ||
|
||
//! Fixed-size header made of five uint32_t fields. | ||
//! DictFSSTCompression::DICTIONARY_HEADER_SIZE is defined as sizeof() of this struct. | ||
//! Field meanings below are inferred from the names only - TODO confirm against the reader/writer code. | ||
typedef struct { | ||
// presumably the size of the dictionary in bytes | ||
uint32_t dict_size; | ||
// presumably the end offset of the dictionary | ||
uint32_t dict_end; | ||
// presumably the offset at which the index buffer starts | ||
uint32_t index_buffer_offset; | ||
// presumably the number of entries in the index buffer | ||
uint32_t index_buffer_count; | ||
// presumably the bit width used to bitpack the selection buffer | ||
uint32_t bitpacking_width; | ||
} dict_fsst_compression_header_t; | ||
|
||
//! Constants and static helpers for dict_fsst compression. Only declarations are visible in this header. | ||
struct DictFSSTCompression { | ||
public: | ||
//! Ratio threshold (1.2); how it is applied is not visible in this header | ||
static constexpr float MINIMUM_COMPRESSION_RATIO = 1.2F; | ||
//! Size in bytes of the dict_fsst_compression_header_t stored at the beginning of the string segment | ||
static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(dict_fsst_compression_header_t); | ||
|
||
public: | ||
//! Space helpers taking the tuple count, index count, dictionary size and packing width. | ||
//! Presumably HasEnoughSpace compares RequiredSpace(...) against block_size - TODO confirm in the definition | ||
static bool HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size, | ||
bitpacking_width_t packing_width, const idx_t block_size); | ||
static idx_t RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size, | ||
bitpacking_width_t packing_width); | ||
|
||
//! Accessors for the StringDictionaryContainer of a segment, given the segment and its buffer handle | ||
static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle); | ||
static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer container); | ||
}; | ||
|
||
//! Abstract class managing the compression state for size analysis or compression. | ||
//! Subclasses implement the protected pure virtual hooks below; UpdateState is the only public entry point. | ||
class DictFSSTCompressionState : public CompressionState { | ||
public: | ||
explicit DictFSSTCompressionState(const CompressionInfo &info); | ||
~DictFSSTCompressionState() override; | ||
|
||
public: | ||
//! Processes `count` values of `scan_vector`; presumably by dispatching to the hooks below. | ||
//! The meaning of the bool result is not visible in this header - TODO confirm in the definition | ||
bool UpdateState(Vector &scan_vector, idx_t count); | ||
|
||
protected: | ||
// Should verify the state | ||
virtual void Verify() = 0; | ||
// Performs a lookup of str, storing the result internally | ||
virtual bool LookupString(string_t str) = 0; | ||
// Add the most recently looked up str to compression state | ||
virtual void AddLastLookup() = 0; | ||
// Add string to the state that is known to not be seen yet | ||
virtual void AddNewString(string_t str) = 0; | ||
// Add a null value to the compression state | ||
virtual void AddNull() = 0; | ||
// Needs to be called before adding a value. Will return false if a flush is required first. | ||
virtual bool CalculateSpaceRequirements(bool new_string, idx_t string_size) = 0; | ||
// Flush the segment to disk if compressing or reset the counters if analyzing | ||
virtual void Flush(bool final = false) = 0; | ||
}; | ||
|
||
} // namespace dict_fsst | ||
|
||
} // namespace duckdb |
64 changes: 64 additions & 0 deletions
64
src/include/duckdb/storage/compression/dict_fsst/compression.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
#pragma once | ||
|
||
#include "duckdb/common/typedefs.hpp" | ||
#include "duckdb/storage/compression/dictionary/common.hpp" | ||
#include "duckdb/function/compression_function.hpp" | ||
#include "duckdb/common/string_map_set.hpp" | ||
#include "duckdb/storage/table/column_data_checkpointer.hpp" | ||
|
||
namespace duckdb { | ||
namespace dict_fsst { | ||
|
||
// Dictionary compression uses a combination of bitpacking and a dictionary to compress string segments. The data is | ||
// stored across three buffers: the index buffer, the selection buffer and the dictionary. First, the index buffer | ||
// contains the offsets into the dictionary, which are also used to determine the string lengths. Each value in the | ||
// dictionary gets a single unique index in the index buffer. Second, the selection buffer maps the tuples to an index | ||
// in the index buffer. The selection buffer is compressed with bitpacking. Finally, the dictionary simply contains all | ||
// the unique strings without lengths or null termination, as the lengths can be deduced from the index buffer. The | ||
// selection buffer was added for two reasons: first, to allow the scan to emit dictionary vectors by | ||
// scanning the whole dictionary at once and then scanning the selection buffer for each emitted vector; second, it | ||
// allows for efficient bitpacking compression, as the selection values should remain relatively small. | ||
|
||
//===--------------------------------------------------------------------===// | ||
// Compress | ||
//===--------------------------------------------------------------------===// | ||
//! Compress-phase implementation of the DictFSSTCompressionState hooks. | ||
//! All member functions are only declared here; their definitions are not part of this header. | ||
struct DictFSSTCompressionCompressState : public DictFSSTCompressionState { | ||
public: | ||
DictFSSTCompressionCompressState(ColumnDataCheckpointData &checkpoint_data_p, const CompressionInfo &info); | ||
|
||
public: | ||
// Presumably starts a fresh segment beginning at row_start - TODO confirm in the definition | ||
void CreateEmptySegment(idx_t row_start); | ||
// Overrides of the pure virtual hooks declared on DictFSSTCompressionState | ||
void Verify() override; | ||
bool LookupString(string_t str) override; | ||
void AddNewString(string_t str) override; | ||
void AddNull() override; | ||
void AddLastLookup() override; | ||
bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override; | ||
void Flush(bool final = false) override; | ||
// Returns an idx_t; presumably the final size of the segment - verify against the definition | ||
idx_t Finalize(); | ||
|
||
public: | ||
// References held for the lifetime of this state (reference members, so the state is not reassignable) | ||
ColumnDataCheckpointData &checkpoint_data; | ||
CompressionFunction &function; | ||
|
||
// State regarding current segment | ||
unique_ptr<ColumnSegment> current_segment; | ||
BufferHandle current_handle; | ||
StringDictionaryContainer current_dictionary; | ||
// NOTE(review): no in-class initializer; presumably set by the constructor or CreateEmptySegment - confirm | ||
data_ptr_t current_end_ptr; | ||
|
||
// Buffers and map for current segment | ||
StringHeap heap; | ||
string_map_t<uint32_t> current_string_map; | ||
vector<uint32_t> index_buffer; | ||
vector<uint32_t> selection_buffer; | ||
|
||
// Bitpacking widths, both starting at 0 | ||
bitpacking_width_t current_width = 0; | ||
bitpacking_width_t next_width = 0; | ||
|
||
// Result of latest LookupString call | ||
// NOTE(review): no in-class initializer; presumably only read after a LookupString call - confirm | ||
uint32_t latest_lookup_result; | ||
}; | ||
|
||
} // namespace dict_fsst | ||
} // namespace duckdb |
54 changes: 54 additions & 0 deletions
54
src/include/duckdb/storage/compression/dict_fsst/decompression.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#pragma once | ||
|
||
#include "duckdb/storage/compression/dict_fsst/common.hpp" | ||
|
||
namespace duckdb { | ||
|
||
namespace dict_fsst { | ||
|
||
//===--------------------------------------------------------------------===// | ||
// Scan | ||
//===--------------------------------------------------------------------===// | ||
// FIXME: why is this StringScanState when we also define: `BufferHandle handle` ??? | ||
//! Scan state for a dict_fsst string segment. | ||
//! The buffer handle is either owned by this state (rvalue constructor) or borrowed from the caller (lvalue | ||
//! constructor); in both cases it is accessed through `handle`. | ||
//! All scalar and pointer members carry default initializers, so a state that has been constructed but not yet | ||
//! passed through Initialize() holds zero / nullptr instead of indeterminate values. | ||
struct CompressedStringScanState : public StringScanState { | ||
public: | ||
//! Takes ownership of handle_p; `handle` then refers to the owned handle | ||
explicit CompressedStringScanState(BufferHandle &&handle_p) | ||
: StringScanState(), owned_handle(std::move(handle_p)), handle(owned_handle) { | ||
} | ||
//! Borrows handle_p; `owned_handle` stays default-constructed. | ||
//! Presumably the caller must keep handle_p alive for as long as this state is used - confirm at call sites | ||
explicit CompressedStringScanState(BufferHandle &handle_p) : StringScanState(), owned_handle(), handle(handle_p) { | ||
} | ||
|
||
public: | ||
// Only declared here; the definitions are not part of this header | ||
void Initialize(ColumnSegment &segment, bool initialize_dictionary = true); | ||
void ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count); | ||
void ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset, idx_t start, | ||
idx_t scan_count); | ||
|
||
private: | ||
// Presumably builds a string_t for the dictionary entry at dict_offset with the given length - TODO confirm | ||
string_t FetchStringFromDict(int32_t dict_offset, uint16_t string_len); | ||
// Presumably derives the length of the string at `index` from the index buffer - TODO confirm | ||
uint16_t GetStringLength(sel_t index); | ||
|
||
public: | ||
//! Storage for the handle when this state owns it (rvalue constructor); default-constructed otherwise | ||
BufferHandle owned_handle; | ||
//! The handle to use: refers to owned_handle or to the caller's handle | ||
optional_ptr<BufferHandle> handle; | ||
|
||
bitpacking_width_t current_width = 0; | ||
buffer_ptr<SelectionVector> sel_vec; | ||
idx_t sel_vec_size = 0; | ||
|
||
//! Start of the block (pointing to the dictionary_header) | ||
data_ptr_t baseptr = nullptr; | ||
//! Start of the data (pointing to the start of the selection buffer) | ||
data_ptr_t base_data = nullptr; | ||
uint32_t *index_buffer_ptr = nullptr; | ||
uint32_t index_buffer_count = 0; | ||
|
||
buffer_ptr<Vector> dictionary; | ||
idx_t dictionary_size = 0; | ||
StringDictionaryContainer dict; | ||
idx_t block_size = 0; | ||
}; | ||
|
||
} // namespace dict_fsst | ||
|
||
} // namespace duckdb |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.