Skip to content

Commit

Permalink
Fix issues with % in v1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Aug 12, 2018
1 parent 07a1e27 commit 45fa89d
Showing 1 changed file with 78 additions and 59 deletions.
137 changes: 78 additions & 59 deletions src/MurmurHash3.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ non-native version will be less than optimal.
module MurmurHash3
export mmhash128_a, mmhash128_u, mmhash128_c, mmhash32

# Truncating conversions: keep only the low-order bits of `val` that fit in the
# target unsigned type (no range check is performed, unlike `UInt8(val)` etc.).
u8(val) = rem(val, UInt8)
u32(val) = rem(val, UInt32)
u64(val) = rem(val, UInt64)

# Rotate the bits of `x` left by `r` positions, within the width of `x`'s own type.
@inline function rotl(x::Unsigned, r)
    nbits = 8 * sizeof(typeof(x))
    return (x >>> (nbits - r)) | (x << r)
end

# Fold the upper bits of `k` into the lower ones: k XOR (k logically shifted right by 33).
@inline xor33(k::UInt64) = k ⊻ (k >>> 33)
Expand Down Expand Up @@ -47,8 +51,8 @@ end
# Mix a final partial word `k2` into `h2`: scale by `c2`, rotate via `rotl33`,
# scale by `c1`, then XOR the result into `h2`.
@inline mhtail2(h2, k2) = h2 ⊻ (rotl33(k2 * c2) * c1)

@inline function mhfin(len, h1, h2)
h1 = xor(h1, len%UInt64)
h2 = xor(h2, len%UInt64)
h1 = xor(h1, u64(len))
h2 = xor(h2, u64(len))

h1 += h2
h2 += h1
Expand All @@ -62,66 +66,81 @@ end

#---------------------------------------------------------------------------

# upN(val): truncate/widen `val` to an unsigned word and shift it left by N bits,
# i.e. place it N/8 bytes above the bottom of the word.
# up8/up16/up24 produce a UInt32 (via u32); up32..up56 produce a UInt64 (via u64).
up8(val) = u32(val) << 8
up16(val) = u32(val) << 16
up24(val) = u32(val) << 24
up32(val) = u64(val) << 32
up40(val) = u64(val) << 40
up48(val) = u64(val) << 48
up56(val) = u64(val) << 56

# dnN(val): logical right shift of `val` by N bits, truncated to a UInt8 by u8.
dn6(val) = u8(val >>> 6)
dn12(val) = u8(val >>> 12)
dn18(val) = u8(val >>> 18)

# msk6(val): the low 6 bits of `val`, as a UInt8 (the payload of a UTF-8 continuation byte).
msk6(val) = u8(val & 0x3f)

# Support functions for UTF-8 handling
# Each get_utf8_N returns the N bytes of the UTF-8 encoding of code point `ch` as a
# tuple of UInt8, lead byte first.  No range check is done here; the callers select
# N by comparing `ch` against 0x7f / 0x7ff / 0xffff.

# 2-byte form: 110xxxxx 10xxxxxx
@inline get_utf8_2(ch) = (0xc0 | dn6(ch), 0x80 | msk6(ch))

# 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
@inline get_utf8_3(ch) = (0xe0 | dn12(ch), 0x80 | msk6(dn6(ch)), 0x80 | msk6(ch))

# 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
# (the third byte previously referenced an undefined name `ch8` instead of `ch`)
@inline get_utf8_4(ch) =
    (0xf0 | dn18(ch), 0x80 | msk6(dn12(ch)), 0x80 | msk6(dn6(ch)), 0x80 | msk6(ch))

# Optimized in-place conversion to UTF-8 for hashing compatibly with isequal / String
# Widen `v` to UInt64 and shift it left by `n & 7` whole bytes, so that it lands at
# byte offset `n` (mod 8) inside an 8-byte word.
# Explicit parentheses are required: `<<` binds tighter than `%` in Julia.
@inline shift_n(v, n) = u64(v) << (((n & 7) % UInt) << 3)

# cnt in 0:4   - the encoded bytes always fit in k1
# cnt in 5:7   - the encoded bytes may overflow from k1 into k2
# cnt in 8:12  - the encoded bytes always fit in k2
# cnt in 13:15 - the encoded bytes may overflow from k2 into k3

# Pack 2, 3 or 4 values into one word, first argument in the lowest byte and each
# following argument one byte higher (shifts of 8, 16 and 24 bits).
mergebytes(b1, b2) = b1 | up8(b2)
mergebytes(b1, b2, b3) = mergebytes(b1, b2) | up16(b3)
mergebytes(b1, b2, b3, b4) = mergebytes(b1, b2, b3) | up24(b4)

# Append the UTF-8 encoding of code point `ch` to the 8-byte word `k1`, starting at
# byte offset `cnt` (mod 8).  Returns `(cnt + nbytes, updated k1)`.
# The caller must only use this when all encoded bytes are guaranteed to fit in `k1`;
# use `add_utf8_split` when they may spill into the next word.
@inline function add_utf8(cnt, ch, k1::UInt64)
    if ch <= 0x7f
        # ASCII: a single byte, stored as-is
        return cnt + 1, k1 | shift_n(ch, cnt)
    elseif ch <= 0x7ff
        b1, b2 = get_utf8_2(ch)
        return cnt + 2, k1 | shift_n(mergebytes(b1, b2), cnt)
    elseif ch <= 0xffff
        b1, b2, b3 = get_utf8_3(ch)
        return cnt + 3, k1 | shift_n(mergebytes(b1, b2, b3), cnt)
    else
        b1, b2, b3, b4 = get_utf8_4(ch)
        return cnt + 4, k1 | shift_n(mergebytes(b1, b2, b3, b4), cnt)
    end
end

# Append the UTF-8 encoding of code point `ch` to the 8-byte word `k1` at byte offset
# `cnt & 7`, where the encoded bytes may run past the end of `k1`.
# Returns `(cnt + nbytes, updated k1, spill)`, where `spill` holds the bytes that did
# not fit (lowest byte first) and is zero when everything fit.  `spill` is always a
# UInt64 so that it can be used directly as the next 8-byte word.
# The branches below only distinguish offsets 5, 6 and 7; callers are expected to
# invoke this only for those offsets.
@inline function add_utf8_split(cnt, ch, k1::UInt64)
    off = cnt & 7
    if ch <= 0x7f
        return cnt + 1, k1 | shift_n(ch, cnt), u64(0)
    elseif ch <= 0x7ff
        b1, b2 = get_utf8_2(ch)
        if off == 7
            # one byte left in k1, second byte starts the next word
            return cnt + 2, k1 | up56(b1), u64(b2)
        else
            return cnt + 2, k1 | shift_n(mergebytes(b1, b2), cnt), u64(0)
        end
    elseif ch <= 0xffff
        b1, b2, b3 = get_utf8_3(ch)
        if off == 5
            # exactly fills k1
            return cnt + 3, k1 | up40(b1) | up48(b2) | up56(b3), u64(0)
        elseif off == 6
            return cnt + 3, k1 | up48(b1) | up56(b2), u64(b3)
        else
            return cnt + 3, k1 | up56(b1), u64(b2) | up8(b3)
        end
    else
        # This will always go over, may be 1, 2, 3 bytes in second word
        b1, b2, b3, b4 = get_utf8_4(ch)
        if off == 5
            return cnt + 4, k1 | up40(b1) | up48(b2) | up56(b3), u64(b4)
        elseif off == 6
            # widen b3 explicitly: `b3 | up8(b4)` alone would yield a UInt32
            return cnt + 4, k1 | up48(b1) | up56(b2), u64(b3) | up8(b4)
        else
            return cnt + 4, k1 | up56(b1), u64(b2) | up8(b3) | up16(b4)
        end
    end
end
Expand All @@ -130,23 +149,23 @@ end

# AbstractString MurmurHash3, converts to UTF-8 on the fly
function mmhash128_8_c(str::AbstractString, seed::UInt32)
k1 = k2 = 0%UInt64
h1 = h2 = seed%UInt64
k1 = k2 = u64(0)
h1 = h2 = u64(seed)
cnt = len = 0
@inbounds for ch in str
if cnt < 5
cnt, k1 = add_utf8(cnt, ch%UInt32, k1)
cnt, k1 = add_utf8(cnt, u32(ch), k1)
elseif cnt < 8
cnt, k1, k2 = add_utf8_split(cnt, ch%UInt32, k1)
cnt, k1, k2 = add_utf8_split(cnt, u32(ch), k1)
elseif cnt < 13
cnt, k2 = add_utf8(cnt, ch%UInt32, k2)
cnt, k2 = add_utf8(cnt, u32(ch), k2)
else
cnt, k2, k3 = add_utf8_split(cnt, ch%UInt32, k2)
cnt, k2, k3 = add_utf8_split(cnt, u32(ch), k2)
# When k1 and k2 are full, then hash another block
if cnt > 15
h1, h2 = mhblock(h1, h2, k1, k2)
k1 = k3
k2 = 0%UInt64
k2 = u64(0)
len += 16
cnt &= 15
end
Expand All @@ -166,10 +185,10 @@ end
# the start of the strings are 8-byte aligned, and it is safe to access a full
# 8-byte chunk always at the end (simply masking off the remaining 1-7 bytes)

# Load the word at `pnt` and keep only its lowest `left & 7` bytes, zeroing the rest.
# Note that when `left & 7 == 0` the mask is zero, so the result is zero.
# `UInt64(1)` is used instead of `1%UInt64 << ...`, which parses as `1 % (UInt64 << ...)`.
@inline mask_load(pnt, left) =
    unsafe_load(pnt) & ((UInt64(1) << ((left & 7) << 3)) - 0x1)

function mmhash128_8_a(len::Integer, pnt::Ptr, seed::UInt32)
pnt8, h1, h2 = mhbody(len >>> 4, reinterpret(Ptr{UInt64}, pnt), seed%UInt64, seed%UInt64)
pnt8, h1, h2 = mhbody(len >>> 4, reinterpret(Ptr{UInt64}, pnt), u64(seed), u64(seed))
if (left = len & 15) > 0
h1 = mhtail1(h1, left < 8 ? mask_load(pnt8, left) : unsafe_load(pnt8))
left > 8 && (h2 = mhtail2(h2, mask_load(pnt8 + 8, left)))
Expand All @@ -178,9 +197,9 @@ function mmhash128_8_a(len::Integer, pnt::Ptr, seed::UInt32)
end

# Hash for the case where only a seed is supplied (no data bytes are read here).
# Returns a pair built from `fmix` applied to 2x and 3x the seed (as UInt64).
function mmhash128_8_a(seed::Integer)
    s = u64(seed)
    h1 = fmix(2 * s)
    h2 = fmix(3 * s)
    return h1 + h2, h1 + 2 * h2
end

#----------------------------------------------------------------------------
Expand All @@ -196,13 +215,13 @@ end
function mmhash128_8_u(len::Integer, unaligned_pnt::Ptr, seed::UInt32)
# Should optimize handling of short (< 16 byte) unaligned strings
ulp = reinterpret(UInt, unaligned_pnt)
pnt = reinterpret(Ptr{UInt64}, ulp & ~(7%UInt64))
fin = reinterpret(Ptr{UInt64}, (ulp + len + 0x7) & ~(7%UInt64)) - 8
shft = (ulp & 7%UInt)<<3
pnt = reinterpret(Ptr{UInt64}, ulp & ~u64(7))
fin = reinterpret(Ptr{UInt64}, (ulp + len + 0x7) & ~u64(7)) - 8
shft = (ulp & u64(7))<<3
# println("_mmhash128_8_u($len, $unaligned_pnt, $seed) => $pnt, $fin")
h1 = h2 = seed%UInt64
h1 = h2 = u64(seed)
k1 = unsafe_load(pnt) # Pick up first 1-7 bytes
k2 = 0%UInt64
k2 = u64(0)
while pnt < fin
k1, k2, k3 = shift_mix(shft, k1, unsafe_load(pnt += 8), unsafe_load(pnt += 8))
# print(" pnt=$pnt, k1=0x$(outhex(k1)), k2=0x$(outhex(k2))")
Expand Down Expand Up @@ -251,14 +270,14 @@ end
# 32-bit hash of `len` bytes starting at `pnt`, seeded with `seed`.
function mmhash32(len, pnt, seed::UInt32)
    # Hand the whole 4-byte words (`len >>> 2` of them) to mhbody, which returns the
    # advanced pointer and the running hash.
    pnt, h1 = mhbody(len >>> 2, reinterpret(Ptr{UInt32}, pnt), seed)
    # When `len` is not a multiple of 4, mix in one more word.
    # NOTE(review): this loads a full UInt32 with no masking of the bytes past `len`
    # (compare `mask_load` in the 64-bit path) - confirm this is intended.
    (len & 3) == 0 || (h1 = xor(h1, rotl15(unsafe_load(pnt)) * d1) * d2)
    # Fold the length in and apply the final mix.
    return fmix(xor(h1, u32(len)))
end

@inline function mhfin(len, h1, h2, h3, h4)
h1 = xor(h1, len%UInt32)
h2 = xor(h2, len%UInt32)
h3 = xor(h3, len%UInt32)
h4 = xor(h4, len%UInt32)
h1 = xor(h1, u32(len))
h2 = xor(h2, u32(len))
h3 = xor(h3, u32(len))
h4 = xor(h4, u32(len))

h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1

Expand All @@ -269,7 +288,7 @@ end

h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1

h2%UInt64 << 32 | h1, h4%UInt64 << 32 | h3
up32(h2) | h1, up32(h4) | h3
end

#-----------------------------------------------------------------------------
Expand Down Expand Up @@ -315,51 +334,51 @@ function mmhash128_4(len, pnt, seed::UInt32)
mhfin(len, h1, h2, h3, h4)
end

# Truncate/widen `v` to UInt32 and shift it left by `n & 7` whole bytes.
# Explicit parentheses are required: `<<` binds tighter than `%` in Julia.
# NOTE(review): the offset is masked with 7 although a UInt32 holds only 4 bytes, so
# offsets 4..7 shift everything out and yield zero - confirm callers keep `n` below 4.
@inline shift_n_32(v, n) = u32(v) << (((n & 7) % UInt) << 3)

# Encode code point `ch` as UTF-8 packed into a single UInt32, first byte in the
# lowest-order position.  Returns `(cnt + nbytes, packed bytes)`.
@inline function get_utf8(cnt, ch)
    if ch <= 0x7f
        # ASCII: a single byte
        return cnt + 1, u32(ch)
    elseif ch <= 0x7ff
        b1, b2 = get_utf8_2(ch)
        return cnt + 2, mergebytes(b1, b2)
    elseif ch <= 0xffff
        b1, b2, b3 = get_utf8_3(ch)
        return cnt + 3, mergebytes(b1, b2, b3)
    else
        b1, b2, b3, b4 = get_utf8_4(ch)
        return cnt + 4, mergebytes(b1, b2, b3, b4)
    end
end

# Append the UTF-8 encoding of code point `ch` to the 4-byte word `k1` at byte offset
# `cnt & 7`, where the encoded bytes may run past the end of `k1`.
# Returns `(cnt + nbytes, updated k1, spill)`, where `spill` (a UInt32) holds the bytes
# that did not fit, lowest byte first, and is zero when everything fit.
# NOTE(review): the offset is taken as `cnt & 7` even though the word is 4 bytes wide;
# the branches only make sense for offsets 1..3 - confirm the caller guarantees that.
@inline function add_utf8_split(cnt, ch, k1::UInt32)
    off = cnt & 7
    if ch <= 0x7f
        return cnt + 1, k1 | shift_n_32(ch, cnt), u32(0)
    elseif ch <= 0x7ff
        b1, b2 = get_utf8_2(ch)
        if off == 3
            # one byte left in k1, second byte starts the next word
            return cnt + 2, k1 | up24(b1), u32(b2)
        else
            return cnt + 2, k1 | shift_n_32(mergebytes(b1, b2), cnt), u32(0)
        end
    elseif ch <= 0xffff
        b1, b2, b3 = get_utf8_3(ch)
        if off == 1
            # exactly fills k1
            return cnt + 3, k1 | up8(b1) | up16(b2) | up24(b3), u32(0)
        elseif off == 2
            # two bytes fit, at byte positions 2 and 3 (shifts of 16 and 24 bits)
            return cnt + 3, k1 | up16(b1) | up24(b2), u32(b3)
        else
            return cnt + 3, k1 | up24(b1), mergebytes(b2, b3)
        end
    else
        # This will always go over, may be 1, 2, 3 bytes in second word
        b1, b2, b3, b4 = get_utf8_4(ch)
        if off == 1
            return cnt + 4, k1 | up8(b1) | up16(b2) | up24(b3), u32(b4)
        elseif off == 2
            # two bytes fit, at byte positions 2 and 3 (shifts of 16 and 24 bits)
            return cnt + 4, k1 | up16(b1) | up24(b2), mergebytes(b3, b4)
        else
            return cnt + 4, k1 | up24(b1), mergebytes(b2, b3, b4)
        end
    end
end
Expand Down

0 comments on commit 45fa89d

Please sign in to comment.