diff --git a/src/storage/compression/fsst.cpp b/src/storage/compression/fsst.cpp index bad017835047..b3ad41cca960 100644 --- a/src/storage/compression/fsst.cpp +++ b/src/storage/compression/fsst.cpp @@ -12,6 +12,39 @@ #include "fsst.h" #include "miniz_wrapper.hpp" +/* +Data layout per segment: ++------------------------------------------------+ +| Header | +| +----------------------------------------+ | +| | fsst_compression_header_t header | | +| +----------------------------------------+ | +| | ++------------------------------------------------+ +| Index Buffer | +| +------------------------------------+ | +| | uint16_t index_buffer_idx[] | | +| +------------------------------------+ | +| tuple index -> index buffer idx | +| (bitpacked) | +| | ++--------------------------------------------+ +| Symbol Table | +| +------------------------------------+ | +| | duckdb_fsst_decoder_t table | | +| +------------------------------------+ | +| symbol table used to compress the strings | +| | ++--------------------------------------------+ +| Dictionary | +| +------------------------------------+ | +| | uint8_t *raw_string_data | | +| +------------------------------------+ | +| the string data without lengths | +| | ++--------------------------------------------+ +*/ + namespace duckdb { struct FSSTScanState; @@ -79,12 +112,10 @@ struct FSSTAnalyzeState : public AnalyzeState { } ~FSSTAnalyzeState() override { - if (fsst_encoder) { - duckdb_fsst_destroy(fsst_encoder); - } + duckdb_fsst_destroy(fsst_encoder.get()); } - duckdb_fsst_encoder_t *fsst_encoder = nullptr; + optional_ptr fsst_encoder; idx_t count; StringHeap fsst_string_heap; @@ -172,10 +203,10 @@ idx_t FSSTStorage::StringFinalAnalyze(AnalyzeState &state_p) { // TODO: do we really need to encode to get a size estimate? auto compressed_ptrs = vector(string_count, nullptr); auto compressed_sizes = vector(string_count, 0); - unique_ptr compressed_buffer(new unsigned char[output_buffer_size]); + auto compressed_buffer = make_unsafe_uniq_array(output_buffer_size); auto res = - duckdb_fsst_compress(state.fsst_encoder, string_count, &fsst_string_sizes[0], &fsst_string_ptrs[0], + duckdb_fsst_compress(state.fsst_encoder.get(), string_count, &fsst_string_sizes[0], &fsst_string_ptrs[0], output_buffer_size, compressed_buffer.get(), &compressed_sizes[0], &compressed_ptrs[0]); if (string_count != res) { @@ -215,9 +246,7 @@ class FSSTCompressionState : public CompressionState { } ~FSSTCompressionState() override { - if (fsst_encoder) { - duckdb_fsst_destroy(fsst_encoder); - } + duckdb_fsst_destroy(fsst_encoder.get()); } void Reset() { @@ -354,11 +383,7 @@ class FSSTCompressionState : public CompressionState { current_segment->count, current_width); // Write the fsst symbol table or nothing - if (fsst_encoder != nullptr) { - memcpy(base_ptr + symbol_table_offset, &fsst_serialized_symbol_table[0], fsst_serialized_symbol_table_size); - } else { - memset(base_ptr + symbol_table_offset, 0, fsst_serialized_symbol_table_size); - } + memcpy(base_ptr + symbol_table_offset, &fsst_serialized_symbol_table[0], fsst_serialized_symbol_table_size); Store(NumericCast(symbol_table_offset), data_ptr_cast(&header_ptr->fsst_symbol_table_offset)); @@ -399,7 +424,7 @@ class FSSTCompressionState : public CompressionState { bitpacking_width_t current_width; idx_t last_fitting_size; - duckdb_fsst_encoder_t *fsst_encoder = nullptr; + optional_ptr fsst_encoder; unsigned char fsst_serialized_symbol_table[sizeof(duckdb_fsst_decoder_t)]; size_t fsst_serialized_symbol_table_size = sizeof(duckdb_fsst_decoder_t); }; @@ -413,9 +438,10 @@ unique_ptr FSSTStorage::InitCompression(ColumnDataCheckpointer throw InternalException("No encoder found during FSST compression"); } - compression_state->fsst_encoder = analyze_state.fsst_encoder; + compression_state->fsst_encoder = std::move(analyze_state.fsst_encoder); compression_state->fsst_serialized_symbol_table_size = - duckdb_fsst_export(compression_state->fsst_encoder, &compression_state->fsst_serialized_symbol_table[0]); + duckdb_fsst_export(compression_state->fsst_encoder.get(), &compression_state->fsst_serialized_symbol_table[0]); + // FIXME: move of 'optional_ptr' should already set to null, no??? analyze_state.fsst_encoder = nullptr; return std::move(compression_state); @@ -470,14 +496,14 @@ void FSSTStorage::Compress(CompressionState &state_p, Vector &scan_vector, idx_t vector compress_buffer(compress_buffer_size, 0); auto res = duckdb_fsst_compress( - state.fsst_encoder, /* IN: encoder obtained from duckdb_fsst_create(). */ - total_count, /* IN: number of strings in batch to compress. */ - &sizes_in[0], /* IN: byte-lengths of the inputs */ - &strings_in[0], /* IN: input string start pointers. */ - compress_buffer_size, /* IN: byte-length of output buffer. */ - &compress_buffer[0], /* OUT: memory buffer to put the compressed strings in (one after the other). */ - &sizes_out[0], /* OUT: byte-lengths of the compressed strings. */ - &strings_out[0] /* OUT: output string start pointers. Will all point into [output,output+size). */ + state.fsst_encoder.get(), /* IN: encoder obtained from duckdb_fsst_create(). */ + total_count, /* IN: number of strings in batch to compress. */ + sizes_in.data(), /* IN: byte-lengths of the inputs */ + strings_in.data(), /* IN: input string start pointers. */ + compress_buffer_size, /* IN: byte-length of output buffer. */ + compress_buffer.data(), /* OUT: memory buffer to put the compressed strings in (one after the other). */ + sizes_out.data(), /* OUT: byte-lengths of the compressed strings. */ + strings_out.data() /* OUT: output string start pointers. Will all point into [output,output+size). */ ); if (res != total_count) {