Skip to content

Commit

Permalink
add split encoding support and tests (#221)
Browse files Browse the repository at this point in the history
* add split encoding support and tests
  • Loading branch information
Moelf authored Feb 26, 2023
1 parent dfce0e4 commit 3c2ad37
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 19 deletions.
4 changes: 2 additions & 2 deletions src/RNTuple/constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ const rntuple_col_type_dict = (
Int32,
Int16,
Int8,
UInt32, # SplitIndex64 delta encoding
UInt64, # SplitIndex32 delta encoding
Index32, # split delta encoding
Index64, # split
Float64, # split
Float32, # split
Float16, # split
Expand Down
9 changes: 6 additions & 3 deletions src/RNTuple/fieldcolumn_reading.jl
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,14 @@ function read_field(io, field::RNTupleCardinality{T}, page_list) where T
return res::_field_output_type(field)
end


# Output type of reading a `LeafField{T}`: `read_field` reinterprets the raw page bytes as
# `T` and asserts the result against this type. `T_Reinter` is defined elsewhere.
# NOTE(review): `read_field` calls this with a field instance, so an instance-to-type
# forwarding method presumably exists elsewhere — confirm.
_field_output_type(::Type{LeafField{T}}) where {T} = T_Reinter{T}
"""
    read_field(io, field::LeafField{T}, page_list) where T

Read every page of the column referenced by `field` from `io` and return the
decompressed bytes reinterpreted as elements of type `T`.

`page_list` is indexed with `field.content_col_idx` to obtain the page descriptions of
the column. Columns whose RNTuple type number lies in `14:21` are treated as split
encoded, and the split layout is undone page by page while reading.
"""
function read_field(io, field::LeafField{T}, page_list) where T
    pages = page_list[field.content_col_idx]
    # Split encoding is applied within each page, so it has to be undone by
    # `read_pagedesc` before the pages are concatenated.
    split = 14 <= field.type <= 21
    # Read the pages exactly once; an extra un-split read would only repeat the I/O and
    # decompression to produce a buffer that is thrown away.
    bytes = read_pagedesc(io, pages, field.nbits; split = split)
    res = reinterpret(T, bytes)
    return res::_field_output_type(field)
end
Expand All @@ -94,8 +97,8 @@ function read_field(io, field::LeafField{Bool}, page_list)
total_num_elements = sum(p.num_elements for p in pages)

# pad to nearest 8*k bytes because each chunk needs to be UInt64
original_bytes = read_pagedesc(io, pages, nbits)
bytes = vcat(original_bytes, zeros(eltype(original_bytes), 8 - rem(total_num_elements, 8)))
bytes = read_pagedesc(io, pages, nbits)
append!(bytes, zeros(eltype(bytes), 8 - rem(total_num_elements, 8)))
chunks = reinterpret(UInt64, bytes)

res = BitVector(undef, total_num_elements)
Expand Down
8 changes: 6 additions & 2 deletions src/RNTuple/fieldcolumn_schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,19 @@ end
"""
struct LeafField{T}
content_col_idx::Int
type::Int
nbits::Int
end
Base case of field nesting, this links to a column in the RNTuple by 0-based index.
`T` is the `eltype` of this field which mostly uses Julia native types except for
`Switch`.
The `type` field is the RNTuple spec type number, used to record split encoding.
"""
struct LeafField{T}
    # Index of the column that holds this field's content.
    # NOTE(review): documented as a 0-based index, yet `read_field` uses it directly to
    # index `page_list` — confirm which convention applies.
    content_col_idx::Int
    # RNTuple spec type number of the column; `read_field` treats 14 through 21 as
    # split encoded.
    type::Int
    # Bits per element, used to compute the uncompressed size of each page
    # (1 for booleans, which are stored as single bits).
    nbits::Int
end

Expand All @@ -62,11 +66,11 @@ function _search_col_type(field_id, column_records, col_id::Int...)
if length(col_id) == 2 &&
column_records[col_id[1]].type == 2 &&
column_records[col_id[2]].type == 5
return StringField(LeafField{Int32}(col_id[1], 32), LeafField{Char}(col_id[2], 8))
return StringField(LeafField{Int32}(col_id[1], 2, 32), LeafField{Char}(col_id[2], 5, 8))
elseif length(col_id) == 1
record = column_records[only(col_id)]
LeafType = rntuple_col_type_dict[record.type]
return LeafField{LeafType}(only(col_id), record.nbits)
return LeafField{LeafType}(only(col_id), record.type, record.nbits)
else
error("un-handled base case, report issue to authors")
end
Expand Down
46 changes: 38 additions & 8 deletions src/RNTuple/footer.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,30 @@ end
locator::Locator
end

# https://discourse.julialang.org/t/simd-gather-result-in-slow-down/95161/2
"""
    split4_reinterpret(src::Vector{UInt8})

Undo the byte-split layout of a page holding 4-byte elements.

`src` stores the first byte of every element, then every second byte, and so on. The
returned `Vector{UInt8}` has the same length as `src` and holds the elements
contiguously, assembled with the first stored byte as the least significant one.
`src` itself is not modified.
"""
function split4_reinterpret(src::Vector{UInt8})
    out = similar(src)
    nelem = length(src) ÷ 4
    # Writing whole words through this view fills `out` four bytes at a time.
    words = reinterpret(UInt32, out)
    @inbounds for k in 1:nelem
        word = zero(UInt32)
        # Byte plane `p` of element `k` sits `p * nelem` bytes after the element's first byte.
        for plane in 0:3
            word |= UInt32(src[plane * nelem + k]) << (8 * plane)
        end
        words[k] = word
    end
    return out
end
"""
    split8_reinterpret(src::Vector{UInt8})

Undo the byte-split layout of a page holding 8-byte elements.

`src` stores the first byte of every element, then every second byte, and so on. The
returned `Vector{UInt8}` has the same length as `src` and holds the elements
contiguously, assembled with the first stored byte as the least significant one.
`src` itself is not modified.
"""
function split8_reinterpret(src::Vector{UInt8})
    out = similar(src)
    nelem = length(src) ÷ 8
    # Writing whole words through this view fills `out` eight bytes at a time.
    words = reinterpret(UInt64, out)
    @inbounds for k in 1:nelem
        word = zero(UInt64)
        # Byte plane `p` of element `k` sits `p * nelem` bytes after the element's first byte.
        for plane in 0:7
            word |= UInt64(src[plane * nelem + k]) << (8 * plane)
        end
        words[k] = word
    end
    return out
end

"""
read_pagedesc(io, pagedesc::PageDescription, nbits::Int)
read_pagedesc(io, pagedesc::Vector{PageDescription}, nbits::Integer)
Read the decompressed raw bytes given a Page Description. The
`nbits` need to be provided according to the element type of the
Expand All @@ -57,13 +79,21 @@ column since `pagedesc` only contains `num_elements` information.
Boolean values are always stored as bit in RNTuple, so `nbits = 1`.
"""
function read_pagedesc(io, pagedescs::AbstractVector, nbits::Integer; split=false)
    # A column without pages has no bytes; answer with an empty buffer instead of letting
    # the reduction below fail on an empty collection.
    # NOTE(review): assumes `_read_locator` yields `Vector{UInt8}` — confirm.
    isempty(pagedescs) && return UInt8[]
    # Read (and un-split) every page first, then concatenate once, so the accumulated
    # buffer is not copied again for each additional page.
    pages = [read_pagedesc(io, pagedesc, nbits; split=split) for pagedesc in pagedescs]
    return reduce(vcat, pages)
end

"""
    read_pagedesc(io, pagedesc::PageDescription, nbits::Integer; split=false)

Read the decompressed bytes of a single page. With `split = true` the byte-split layout
of the page is undone before the bytes are returned.
"""
function read_pagedesc(io, pagedesc::PageDescription, nbits::Integer; split=false)
    # `RoundUp` matters for `nbits == 1`: bit-packed elements may end in a partial byte.
    uncomp_size = div(pagedesc.num_elements * nbits, 8, RoundUp)
    bytes = _read_locator(io, pagedesc.locator, uncomp_size)
    split || return bytes
    return _unsplit_page(bytes, nbits)
end

"""
    _unsplit_page(bytes::Vector{UInt8}, nbits::Integer)

Return the bytes of a split-encoded page rearranged so that each element is contiguous.

32-bit and 64-bit elements go through `split4_reinterpret` / `split8_reinterpret`. Any
other whole-byte width (for example 16 bits) is handled by a plain byte shuffle that uses
the same layout, so such pages are no longer returned still split. Widths of one byte or
less, or widths that are not a whole number of bytes, are returned unchanged.

# Throws
- `ArgumentError` if the page length is not a multiple of the element width.
"""
function _unsplit_page(bytes::Vector{UInt8}, nbits::Integer)
    nbits == 32 && return split4_reinterpret(bytes)
    nbits == 64 && return split8_reinterpret(bytes)
    width, partial = divrem(nbits, 8)
    # Nothing to rearrange when an element does not span several whole bytes.
    (width <= 1 || partial != 0) && return bytes
    count, leftover = divrem(length(bytes), width)
    leftover == 0 || throw(ArgumentError(
        "split page of $(length(bytes)) bytes is not a multiple of $(width)-byte elements"))
    dst = similar(bytes)
    # Byte `j` of element `i` is stored at `(j - 1) * count + i` in the split layout.
    @inbounds for i in 1:count, j in 1:width
        dst[(i - 1) * width + j] = bytes[(j - 1) * count + i]
    end
    return dst
end

struct PageLink end
Expand Down
22 changes: 18 additions & 4 deletions test/rntuple_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,21 +120,35 @@ end
length.(t.Muon_charge)
end

@testset "RNTuple Split Encoding" begin
    # Each column of the sample file repeats one constant whose bytes all differ, so the
    # expected values below are written as bit patterns.
    samplepath = UnROOT.samplefile("RNTuple/test_ntuple_split_3e4.root")
    tree = LazyTree(samplepath, "ntuple")

    # 32-bit integer columns, signed and unsigned
    @test all(==(Int32(0x04030201)), tree.one_int32)
    @test all(==(0xffeeddcc), reinterpret(UInt32, tree.two_uint32))

    # vector-of-float column: 0x3dccbbaa is the bit pattern of 0.099967316f0
    @test reinterpret(UInt32, tree.three_vint32[2]) == [0x3dccbbaa]
    @test all(reduce(vcat, tree.three_vint32) .=== 0.099967316f0)
    # the vector lengths cycle through 0, 1, ..., 9
    @test length.(tree.three_vint32) == repeat(0:9, 3000)

    # 64-bit integer column: 578437695752307201 == 0x0807060504030201
    @test all(==(578437695752307201), tree.four_int64)
end

@testset "RNTuple Type stability" begin
    f1 = UnROOT.samplefile("RNTuple/test_ntuple_int_5e4.root")
    t = LazyTree(f1, "ntuple")

    # The helpers are called `f` and `g` so that neither clashes with the local `f1`,
    # which already holds the sample file path.

    # Row-wise access: accumulate one column while iterating over events.
    function f()
        s = 0.0f0
        for evt in t
            s += evt.one_integers
        end
        s
    end
    # Column-wise access: reduce the whole column at once.
    g() = sum(t.one_integers)

    # `@inferred` fails when the return type cannot be concretely inferred.
    @inferred f()
    @inferred g()
end

@testset "RNTuple Multi-threading" begin
Expand Down
34 changes: 34 additions & 0 deletions test/samples/RNTuple/rntuple_split_3e4.C
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
R__LOAD_LIBRARY(ROOTNTuple)
#include <ROOT/RField.hxx>
#include <ROOT/RNTuple.hxx>
#include <ROOT/RNTupleModel.hxx>
#include <ROOT/RRawFile.hxx>

using RNTupleModel = ROOT::Experimental::RNTupleModel;
using RNTupleWriter = ROOT::Experimental::RNTupleWriter;

// ROOT macro that writes `test_ntuple_split_3e4.root`: an RNTuple named "ntuple" with
// 30000 entries. Every scalar field repeats one constant whose bytes are all different.
// NOTE(review): presumably relies on the writer's default column encoding being the
// split one — confirm against the ROOT version used to produce the file.
void rntuple_split_3e4() {
    std::string rootFileName{"test_ntuple_split_3e4.root"};
    auto model = RNTupleModel::Create();
    auto splitint_field = model->MakeField<int32_t>("one_int32");
    auto splitint_field2 = model->MakeField<uint32_t>("two_uint32");
    // despite the "vint32" in its name, this field is a vector of float
    auto splitint_field3 = model->MakeField<std::vector<float>>("three_vint32");
    auto splitint_field4 = model->MakeField<int64_t>("four_int64");


    auto ntuple = RNTupleWriter::Recreate(std::move(model), "ntuple", rootFileName);
    for(auto i=30000; i>0; i--){
        // 0x04030201
        *splitint_field = 67305985;
        // 0xffeeddcc
        *splitint_field2 = 4293844428;
        // 0x3dccbbaa
        // The vector gains one element per entry and is emptied on every 10th iteration
        // before Fill(), so the stored lengths cycle through 0, 1, ..., 9.
        splitint_field3->emplace_back(0.099967316);
        if (i % 10 == 0){
            splitint_field3->clear();
        }
        // 0x0807060504030201
        *splitint_field4 = 578437695752307201;
        ntuple->Fill();
    }
}
Binary file added test/samples/RNTuple/test_ntuple_split_3e4.root
Binary file not shown.

0 comments on commit 3c2ad37

Please sign in to comment.