Skip to content

Commit

Permalink
dict_fsst start
Browse files Browse the repository at this point in the history
  • Loading branch information
Tishj committed Jan 7, 2025
1 parent 39f9990 commit c7b7690
Show file tree
Hide file tree
Showing 24 changed files with 870 additions and 8 deletions.
8 changes: 7 additions & 1 deletion src/common/enums/compression_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ vector<string> ListCompressionTypes(void) {
//! Returns true if `compression_type` is one of the compression methods that are treated as deprecated:
//! PATAS, CHIMP, DICTIONARY or FSST. Every other compression type yields false.
bool CompressionTypeIsDeprecated(CompressionType compression_type) {
	// A single switch keeps all deprecated types in one place, so no check can end up behind an early return
	switch (compression_type) {
	case CompressionType::COMPRESSION_PATAS:
	case CompressionType::COMPRESSION_CHIMP:
	case CompressionType::COMPRESSION_DICTIONARY:
	case CompressionType::COMPRESSION_FSST:
		return true;
	default:
		return false;
	}
}

CompressionType CompressionTypeFromString(const string &str) {
Expand Down Expand Up @@ -50,6 +52,8 @@ CompressionType CompressionTypeFromString(const string &str) {
return CompressionType::COMPRESSION_ALPRD;
} else if (compression == "roaring") {
return CompressionType::COMPRESSION_ROARING;
} else if (compression == "dict_fsst") {
return CompressionType::COMPRESSION_DICT_FSST;
} else {
return CompressionType::COMPRESSION_AUTO;
}
Expand Down Expand Up @@ -85,6 +89,8 @@ string CompressionTypeToString(CompressionType type) {
return "ALPRD";
case CompressionType::COMPRESSION_ROARING:
return "Roaring";
case CompressionType::COMPRESSION_DICT_FSST:
return "DICT_FSST";
case CompressionType::COMPRESSION_EMPTY:
return "Empty Validity";
default:
Expand Down
2 changes: 2 additions & 0 deletions src/function/compression_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ static const DefaultCompressionMethod internal_compression_methods[] = {
{CompressionType::COMPRESSION_ROARING, RoaringCompressionFun::GetFunction, RoaringCompressionFun::TypeIsSupported},
{CompressionType::COMPRESSION_EMPTY, EmptyValidityCompressionFun::GetFunction,
EmptyValidityCompressionFun::TypeIsSupported},
{CompressionType::COMPRESSION_DICT_FSST, DictFSSTCompressionFun::GetFunction,
DictFSSTCompressionFun::TypeIsSupported},
{CompressionType::COMPRESSION_AUTO, nullptr, nullptr}};

static optional_ptr<CompressionFunction> FindCompressionFunction(CompressionFunctionSet &set, CompressionType type,
Expand Down
3 changes: 2 additions & 1 deletion src/include/duckdb/common/enums/compression_type.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ enum class CompressionType : uint8_t {
COMPRESSION_ZSTD = 12,
COMPRESSION_ROARING = 13,
COMPRESSION_EMPTY = 14, // internal only
COMPRESSION_COUNT // This has to stay the last entry of the type!
COMPRESSION_DICT_FSST = 15,
COMPRESSION_COUNT // This has to stay the last entry of the type!
};

bool CompressionTypeIsDeprecated(CompressionType compression_type);
Expand Down
5 changes: 5 additions & 0 deletions src/include/duckdb/function/compression/compression.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ struct DictionaryCompressionFun {
static bool TypeIsSupported(const PhysicalType physical_type);
};

//! Static entry points of the DICT_FSST compression method; both functions are registered together with
//! CompressionType::COMPRESSION_DICT_FSST in the table of internal compression methods.
struct DictFSSTCompressionFun {
	//! Reports whether the given physical type is supported by this compression method
	static bool TypeIsSupported(const PhysicalType physical_type);
	//! Returns the CompressionFunction for the given physical type
	static CompressionFunction GetFunction(PhysicalType type);
};

struct ChimpCompressionFun {
static CompressionFunction GetFunction(PhysicalType type);
static bool TypeIsSupported(const PhysicalType physical_type);
Expand Down
50 changes: 50 additions & 0 deletions src/include/duckdb/storage/compression/dict_fsst/analyze.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#pragma once

#include "duckdb/storage/compression/dictionary/common.hpp"
#include "duckdb/common/string_map_set.hpp"
#include "duckdb/storage/table/column_data.hpp"

namespace duckdb {

namespace dict_fsst {

//===--------------------------------------------------------------------===//
// Analyze
//===--------------------------------------------------------------------===//
//! Analyze-phase implementation of the DictFSSTCompressionState hooks.
//! NOTE(review): the base class DictFSSTCompressionState is declared in
//! "duckdb/storage/compression/dict_fsst/common.hpp" - confirm that header is reachable from this file's includes.
struct DictFSSTAnalyzeState : public DictFSSTCompressionState {
public:
	explicit DictFSSTAnalyzeState(const CompressionInfo &info);

public:
	//! Implementations of the abstract hooks declared by DictFSSTCompressionState
	bool LookupString(string_t str) override;
	void AddNewString(string_t str) override;
	void AddLastLookup() override;
	void AddNull() override;
	bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override;
	void Flush(bool final = false) override;
	void Verify() override;

public:
	// All counters and widths start at zero so the state is well-defined even before the
	// out-of-line constructor (or the first Flush) assigns them.
	//! presumably the number of segments produced so far - TODO confirm against the implementation
	idx_t segment_count = 0;
	//! presumably the tuple / unique-string / dictionary-byte totals of the segment being analyzed - TODO confirm
	idx_t current_tuple_count = 0;
	idx_t current_unique_count = 0;
	idx_t current_dict_size = 0;
	StringHeap heap;
	string_set_t current_set;
	//! Bitpacking widths; `next_width` presumably holds the width required once the pending value is added
	bitpacking_width_t current_width = 0;
	bitpacking_width_t next_width = 0;
};

//! AnalyzeState that owns the DictFSSTAnalyzeState instance; both are constructed from the same CompressionInfo.
struct DictFSSTCompressionAnalyzeState : public AnalyzeState {
public:
	explicit DictFSSTCompressionAnalyzeState(const CompressionInfo &info)
	    : AnalyzeState(info), analyze_state(make_uniq<DictFSSTAnalyzeState>(info)) {
	}

	//! The owned analyze state, created in the constructor
	unique_ptr<DictFSSTAnalyzeState> analyze_state;
};

} // namespace dict_fsst

} // namespace duckdb
64 changes: 64 additions & 0 deletions src/include/duckdb/storage/compression/dict_fsst/common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#pragma once

#include "duckdb/common/typedefs.hpp"
#include "duckdb/function/compression_function.hpp"
#include "duckdb/common/bitpacking.hpp"
#include "duckdb/storage/string_uncompressed.hpp"

namespace duckdb {

namespace dict_fsst {

//! Header of a dict_fsst segment: five consecutive 32-bit unsigned fields.
//! NOTE(review): the per-field meanings below are inferred from the field names - confirm against the
//! code that reads and writes the header.
struct dict_fsst_compression_header_t {
	//! presumably the size of the dictionary
	uint32_t dict_size;
	//! presumably the end offset of the dictionary
	uint32_t dict_end;
	//! presumably the offset at which the index buffer starts
	uint32_t index_buffer_offset;
	//! presumably the number of entries in the index buffer
	uint32_t index_buffer_count;
	//! presumably the bit width used to bitpack the selection buffer
	uint32_t bitpacking_width;
};

//! Static constants and helper declarations of the dict_fsst compression method.
struct DictFSSTCompression {
public:
	//! Size in bytes of the dict_fsst_compression_header_t stored at the beginning of the string segment
	static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(dict_fsst_compression_header_t);
	//! NOTE(review): presumably the minimum ratio required for this method to be selected - confirm at the use site
	static constexpr float MINIMUM_COMPRESSION_RATIO = 1.2F;

	//! Space computation helpers.
	//! NOTE(review): presumably RequiredSpace computes the bytes needed for the given counts, dictionary size
	//! and packing width, and HasEnoughSpace compares that against `block_size` - confirm in the implementation
	static idx_t RequiredSpace(idx_t current_count, idx_t index_count, idx_t dict_size,
	                           bitpacking_width_t packing_width);
	static bool HasEnoughSpace(idx_t current_count, idx_t index_count, idx_t dict_size,
	                           bitpacking_width_t packing_width, const idx_t block_size);

	//! Accessors for the StringDictionaryContainer of `segment`, operating through the given buffer handle
	static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer container);
	static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle);
};

//! Abstract class managing the compression state for size analysis or compression.
//! UpdateState is the public entry point; the pure virtual hooks in the protected section are
//! implemented by DictFSSTAnalyzeState (analysis) and DictFSSTCompressionCompressState (compression).
class DictFSSTCompressionState : public CompressionState {
public:
explicit DictFSSTCompressionState(const CompressionInfo &info);
~DictFSSTCompressionState() override;

public:
//! Entry point taking a vector and the number of values in it.
//! NOTE(review): the meaning of the returned bool is not visible in this header - confirm in the implementation.
bool UpdateState(Vector &scan_vector, idx_t count);

protected:
//! Should verify the state
virtual void Verify() = 0;
//! Performs a lookup of str, storing the result internally
virtual bool LookupString(string_t str) = 0;
//! Adds the most recently looked up str to the compression state
virtual void AddLastLookup() = 0;
//! Adds a string to the state that is known to not have been seen yet
virtual void AddNewString(string_t str) = 0;
//! Adds a null value to the compression state
virtual void AddNull() = 0;
//! Needs to be called before adding a value. Will return false if a flush is required first.
virtual bool CalculateSpaceRequirements(bool new_string, idx_t string_size) = 0;
//! Flushes the segment to disk if compressing, or resets the counters if analyzing
virtual void Flush(bool final = false) = 0;
};

} // namespace dict_fsst

} // namespace duckdb
64 changes: 64 additions & 0 deletions src/include/duckdb/storage/compression/dict_fsst/compression.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#pragma once

#include "duckdb/common/typedefs.hpp"
#include "duckdb/storage/compression/dictionary/common.hpp"
#include "duckdb/function/compression_function.hpp"
#include "duckdb/common/string_map_set.hpp"
#include "duckdb/storage/table/column_data_checkpointer.hpp"

namespace duckdb {
namespace dict_fsst {

// Dictionary compression uses a combination of bitpacking and a dictionary to compress string segments. The data is
// stored across three buffers: the index buffer, the selection buffer and the dictionary. Firstly, the index buffer
// contains the offsets into the dictionary, which are also used to determine the string lengths. Each value in the
// dictionary gets a single unique index in the index buffer. Secondly, the selection buffer maps the tuples to an index
// in the index buffer. The selection buffer is compressed with bitpacking. Finally, the dictionary simply contains all
// the unique strings without lengths or null termination, as we can deduce the lengths from the index buffer. The
// selection buffer was added for two reasons: firstly, to allow the scan to emit dictionary vectors by scanning the
// whole dictionary at once and then scanning the selection buffer for each emitted vector. Secondly, it allows for
// efficient bitpacking compression, as the selection values should remain relatively small.

//===--------------------------------------------------------------------===//
// Compress
//===--------------------------------------------------------------------===//
//! Compression-phase implementation of the DictFSSTCompressionState hooks; holds the segment that is
//! currently being written together with the buffers and lookup map built up for it.
//! NOTE(review): the base class DictFSSTCompressionState is declared in
//! "duckdb/storage/compression/dict_fsst/common.hpp" - confirm that header is reachable from this file's includes.
struct DictFSSTCompressionCompressState : public DictFSSTCompressionState {
public:
	DictFSSTCompressionCompressState(ColumnDataCheckpointData &checkpoint_data_p, const CompressionInfo &info);

public:
	//! NOTE(review): presumably starts a fresh segment beginning at `row_start` - confirm in the implementation
	void CreateEmptySegment(idx_t row_start);
	//! Implementations of the abstract hooks declared by DictFSSTCompressionState
	void Verify() override;
	bool LookupString(string_t str) override;
	void AddNewString(string_t str) override;
	void AddNull() override;
	void AddLastLookup() override;
	bool CalculateSpaceRequirements(bool new_string, idx_t string_size) override;
	void Flush(bool final = false) override;
	//! NOTE(review): the meaning of the returned idx_t is not visible in this header - confirm in the implementation
	idx_t Finalize();

public:
	ColumnDataCheckpointData &checkpoint_data;
	CompressionFunction &function;

	// State regarding current segment
	unique_ptr<ColumnSegment> current_segment;
	BufferHandle current_handle;
	StringDictionaryContainer current_dictionary;
	// Starts out null so that a use before a segment has been set up is detectable
	data_ptr_t current_end_ptr = nullptr;

	// Buffers and map for current segment
	StringHeap heap;
	string_map_t<uint32_t> current_string_map;
	vector<uint32_t> index_buffer;
	vector<uint32_t> selection_buffer;

	bitpacking_width_t current_width = 0;
	bitpacking_width_t next_width = 0;

	// Result of latest LookupString call; zero-initialized like the widths above so it never holds
	// an indeterminate value
	uint32_t latest_lookup_result = 0;
};

} // namespace dict_fsst
} // namespace duckdb
54 changes: 54 additions & 0 deletions src/include/duckdb/storage/compression/dict_fsst/decompression.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#pragma once

#include "duckdb/storage/compression/dict_fsst/common.hpp"

namespace duckdb {

namespace dict_fsst {

//===--------------------------------------------------------------------===//
// Scan
//===--------------------------------------------------------------------===//
// FIXME: why is this StringScanState when we also define: `BufferHandle handle` ???
//! Scan state for dict_fsst compressed string segments.
//! The state either owns its BufferHandle (rvalue constructor) or refers to a handle owned by the caller
//! (lvalue constructor); in both cases `handle` is the pointer that refers to the handle in use.
struct CompressedStringScanState : public StringScanState {
public:
	//! Takes ownership of `handle_p`: it is moved into `owned_handle` and `handle` is pointed at that member.
	//! NOTE(review): `handle` then refers to a member of this object, so a copied or moved instance would
	//! keep pointing at the source object's `owned_handle` - confirm instances are never copied or moved.
	explicit CompressedStringScanState(BufferHandle &&handle_p)
	    : StringScanState(), owned_handle(std::move(handle_p)), handle(owned_handle) {
	}
	//! Refers to a handle owned by the caller; `owned_handle` stays default-constructed
	explicit CompressedStringScanState(BufferHandle &handle_p) : StringScanState(), owned_handle(), handle(handle_p) {
	}

public:
	void Initialize(ColumnSegment &segment, bool initialize_dictionary = true);
	void ScanToFlatVector(Vector &result, idx_t result_offset, idx_t start, idx_t scan_count);
	void ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset, idx_t start,
	                            idx_t scan_count);

private:
	string_t FetchStringFromDict(int32_t dict_offset, uint16_t string_len);
	uint16_t GetStringLength(sel_t index);

public:
	BufferHandle owned_handle;
	optional_ptr<BufferHandle> handle;

	// The scalar and raw-pointer members below are default-initialized (matching `sel_vec_size`) so they
	// never hold indeterminate values before Initialize has been called.
	bitpacking_width_t current_width = 0;
	buffer_ptr<SelectionVector> sel_vec;
	idx_t sel_vec_size = 0;

	//! Start of the block (pointing to the dictionary_header)
	data_ptr_t baseptr = nullptr;
	//! Start of the data (pointing to the start of the selection buffer)
	data_ptr_t base_data = nullptr;
	uint32_t *index_buffer_ptr = nullptr;
	uint32_t index_buffer_count = 0;

	buffer_ptr<Vector> dictionary;
	idx_t dictionary_size = 0;
	StringDictionaryContainer dict;
	idx_t block_size = 0;
};

} // namespace dict_fsst

} // namespace duckdb
2 changes: 2 additions & 0 deletions src/include/duckdb/storage/compression/dictionary/analyze.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "duckdb/storage/table/column_data.hpp"

namespace duckdb {
namespace dictionary {

//===--------------------------------------------------------------------===//
// Analyze
Expand Down Expand Up @@ -43,4 +44,5 @@ struct DictionaryCompressionAnalyzeState : public AnalyzeState {
unique_ptr<DictionaryAnalyzeState> analyze_state;
};

} // namespace dictionary
} // namespace duckdb
2 changes: 2 additions & 0 deletions src/include/duckdb/storage/compression/dictionary/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "duckdb/storage/string_uncompressed.hpp"

namespace duckdb {
namespace dictionary {

typedef struct {
uint32_t dict_size;
Expand Down Expand Up @@ -57,4 +58,5 @@ class DictionaryCompressionState : public CompressionState {
virtual void Flush(bool final = false) = 0;
};

} // namespace dictionary
} // namespace duckdb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "duckdb/storage/table/column_data_checkpointer.hpp"

namespace duckdb {
namespace dictionary {

// Dictionary compression uses a combination of bitpacking and a dictionary to compress string segments. The data is
// stored across three buffers: the index buffer, the selection buffer and the dictionary. Firstly the Index buffer
Expand Down Expand Up @@ -59,4 +60,5 @@ struct DictionaryCompressionCompressState : public DictionaryCompressionState {
uint32_t latest_lookup_result;
};

} // namespace dictionary
} // namespace duckdb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "duckdb/storage/compression/dictionary/common.hpp"

namespace duckdb {
namespace dictionary {

//===--------------------------------------------------------------------===//
// Scan
Expand Down Expand Up @@ -47,4 +48,5 @@ struct CompressedStringScanState : public StringScanState {
idx_t block_size;
};

} // namespace dictionary
} // namespace duckdb
2 changes: 2 additions & 0 deletions src/storage/compression/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ add_subdirectory(chimp)
add_subdirectory(alp)
add_subdirectory(roaring)
add_subdirectory(dictionary)
add_subdirectory(dict_fsst)

add_library_unity(
duckdb_storage_compression
Expand All @@ -19,6 +20,7 @@ add_library_unity(
zstd.cpp
alprd.cpp
fsst.cpp
dict_fsst.cpp
empty_validity.cpp)
set(ALL_OBJECT_FILES
${ALL_OBJECT_FILES} $<TARGET_OBJECTS:duckdb_storage_compression>
Expand Down
Loading

0 comments on commit c7b7690

Please sign in to comment.