Skip to content

Commit

Permalink
cleaned up the scan code
Browse files Browse the repository at this point in the history
  • Loading branch information
Tishj committed Dec 12, 2024
1 parent ca65b1a commit b656bde
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,27 @@ struct CompressedStringScanState : public StringScanState {
void ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset, idx_t start,
idx_t scan_count);

public:
void Initialize(ColumnSegment &segment, bool initialize_dictionary = true);

public:
BufferHandle owned_handle;
optional_ptr<BufferHandle> handle;
buffer_ptr<Vector> dictionary;
idx_t dictionary_size;

bitpacking_width_t current_width;
buffer_ptr<SelectionVector> sel_vec;
idx_t sel_vec_size = 0;

//! Start of the block (pointing to the dictionary_header)
data_ptr_t baseptr;
//! Start of the data (pointing to the start of the selection buffer)
data_ptr_t base_data;
uint32_t *index_buffer_ptr;
uint32_t index_buffer_count;

buffer_ptr<Vector> dictionary;
idx_t dictionary_size;
StringDictionaryContainer dict;
};

} // namespace duckdb
46 changes: 32 additions & 14 deletions src/storage/compression/dictionary/decompression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,42 @@

namespace duckdb {

void CompressedStringScanState::ScanToFlatVector(ColumnSegment &segment, Vector &result, idx_t result_offset,
idx_t start, idx_t scan_count) {
auto baseptr = handle->Ptr() + segment.GetBlockOffset();
auto dict = DictionaryCompression::GetDictionary(segment, *handle);
//! Reads the dictionary compression header of the segment and caches the derived values on this scan state
//! (baseptr, index_buffer_count, current_width, index_buffer_ptr, base_data).
//! If initialize_dictionary is true, every dictionary entry is additionally materialized into `dictionary`.
//! Throws an IOException if the index buffer described by the header does not fit inside the block.
void CompressedStringScanState::Initialize(ColumnSegment &segment, bool initialize_dictionary) {
	baseptr = handle->Ptr() + segment.GetBlockOffset();

	// Load header values
	auto header_ptr = reinterpret_cast<dictionary_compression_header_t *>(baseptr);
	auto index_buffer_offset = Load<uint32_t>(data_ptr_cast(&header_ptr->index_buffer_offset));
	index_buffer_count = Load<uint32_t>(data_ptr_cast(&header_ptr->index_buffer_count));
	current_width = static_cast<bitpacking_width_t>(Load<uint32_t>(data_ptr_cast(&header_ptr->bitpacking_width)));

	// Validate the header before deriving any pointer from it
	if (segment.GetBlockOffset() + index_buffer_offset + sizeof(uint32_t) * index_buffer_count >
	    segment.GetBlockManager().GetBlockSize()) {
		throw IOException(
		    "Failed to scan dictionary string - index was out of range. Database file appears to be corrupted.");
	}

	// These assign the MEMBERS of the scan state: no local with the same name may be declared in this
	// function, otherwise the members would silently stay uninitialized for the scan methods
	index_buffer_ptr = reinterpret_cast<uint32_t *>(baseptr + index_buffer_offset);
	base_data = data_ptr_cast(baseptr + DictionaryCompression::DICTIONARY_HEADER_SIZE);

	if (!initialize_dictionary) {
		// Used by fetch, as fetch will never produce a DictionaryVector
		return;
	}

	dict = DictionaryCompression::GetDictionary(segment, *handle);
	dictionary = make_buffer<Vector>(segment.type, index_buffer_count);
	dictionary_size = index_buffer_count;
	auto dict_child_data = FlatVector::GetData<string_t>(*(dictionary));

	// Fill the dictionary vector with one string_t per entry of the index buffer
	for (uint32_t i = 0; i < index_buffer_count; i++) {
		uint16_t str_len = DictionaryCompression::GetStringLength(index_buffer_ptr, i);
		dict_child_data[i] = DictionaryCompression::FetchStringFromDict(
		    segment, dict, baseptr, UnsafeNumericCast<int32_t>(index_buffer_ptr[i]), str_len);
	}
}

void CompressedStringScanState::ScanToFlatVector(ColumnSegment &segment, Vector &result, idx_t result_offset,
idx_t start, idx_t scan_count) {
auto result_data = FlatVector::GetData<string_t>(result);

// Handling non-bitpacking-group-aligned start values;
Expand Down Expand Up @@ -44,14 +70,6 @@ void CompressedStringScanState::ScanToFlatVector(ColumnSegment &segment, Vector

void CompressedStringScanState::ScanToDictionaryVector(ColumnSegment &segment, Vector &result, idx_t result_offset,
idx_t start, idx_t scan_count) {
auto baseptr = handle->Ptr() + segment.GetBlockOffset();
auto dict = DictionaryCompression::GetDictionary(segment, *handle);

auto header_ptr = reinterpret_cast<dictionary_compression_header_t *>(baseptr);
auto index_buffer_offset = Load<uint32_t>(data_ptr_cast(&header_ptr->index_buffer_offset));
auto index_buffer_ptr = reinterpret_cast<uint32_t *>(baseptr + index_buffer_offset);

auto base_data = data_ptr_cast(baseptr + DictionaryCompression::DICTIONARY_HEADER_SIZE);
auto result_data = FlatVector::GetData<string_t>(result);

D_ASSERT(start % BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE == 0);
Expand Down
28 changes: 1 addition & 27 deletions src/storage/compression/dictionary_compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,33 +115,7 @@ void DictionaryCompressionStorage::FinalizeCompress(CompressionState &state_p) {
//! Creates the scan state for a dictionary-compressed string segment.
//! Pins the block of the segment and hands the resulting buffer handle to a CompressedStringScanState,
//! then calls Initialize on it with initialize_dictionary = true.
//! Initialize throws an IOException if the index buffer described by the header is out of range.
unique_ptr<SegmentScanState> DictionaryCompressionStorage::StringInitScan(ColumnSegment &segment) {
	auto &buffer_manager = BufferManager::GetBufferManager(segment.db);
	auto state = make_uniq<CompressedStringScanState>(buffer_manager.Pin(segment.block));
	// Initialize loads the header values, performs the bounds check and builds the dictionary;
	// repeating that work here would only compute values that Initialize overwrites afterwards
	state->Initialize(segment, true);
	// std::move is required for the conversion from unique_ptr<CompressedStringScanState>
	return std::move(state);
}

Expand Down

0 comments on commit b656bde

Please sign in to comment.