Fix issues with % in v1.0

JuliaString · Aug 12, 2018 · 45fa89d · 45fa89d
1 parent 07a1e27
commit 45fa89d
Showing 1 changed file with 78 additions and 59 deletions.
diff --git a/src/MurmurHash3.jl b/src/MurmurHash3.jl
@@ -13,6 +13,10 @@ non-native version will be less than optimal.
 module MurmurHash3
 export mmhash128_a, mmhash128_u, mmhash128_c, mmhash32
 
+u8(val)  = val%UInt8
+u32(val) = val%UInt32
+u64(val) = val%UInt64
+
 @inline rotl(x::Unsigned, r) = (x << r) | (x >>> (sizeof(typeof(x))*8 - r))
 
 @inline xor33(k::UInt64) = xor(k, k >>> 33)
@@ -47,8 +51,8 @@ end
 @inline mhtail2(h2, k2) = xor(h2, rotl33(k2 * c2) * c1)
 
 @inline function mhfin(len, h1, h2)
-    h1 = xor(h1, len%UInt64)
-    h2 = xor(h2, len%UInt64)
+    h1 = xor(h1, u64(len))
+    h2 = xor(h2, u64(len))
 
     h1 += h2
     h2 += h1
@@ -62,66 +66,81 @@ end
 
 #---------------------------------------------------------------------------
 
+up8(val)  = u32(val) << 8
+up16(val) = u32(val) << 16
+up24(val) = u32(val) << 24
+up32(val) = u64(val) << 32
+up40(val) = u64(val) << 40
+up48(val) = u64(val) << 48
+up56(val) = u64(val) << 56
+
+dn6(val) = u8(val >>> 6)
+dn12(val) = u8(val >>> 12)
+dn18(val) = u8(val >>> 18)
+
+msk6(val) = u8(val & 0x3f)
+
 # Support functions for UTF-8 handling
-@inline get_utf8_2(ch) =
-    (0xc0 | (ch >>> 6)%UInt8, 0x80 | (ch & 0x3f)%UInt8)
-@inline get_utf8_3(ch) =
-    (0xe0 | (ch >>> 12)%UInt8, 0x80 | ((ch >>> 6) & 0x3f)%UInt8, 0x80 | (ch & 0x3f)%UInt8)
-@inline get_utf8_4(ch) =
-    (0xf0 | (ch >>>  18)%UInt8, 0x80 | ((ch >>> 12) & 0x3f)%UInt8,
-     0x80 | ((ch >>>  6) & 0x3f)%UInt8, 0x80 | (ch & 0x3f)%UInt8)
+@inline get_utf8_2(ch) = (0xc0 | dn6(ch),  0x80 | msk6(ch))
+@inline get_utf8_3(ch) = (0xe0 | dn12(ch), 0x80 | msk6(dn6(ch)), 0x80 | msk6(ch))
+@inline get_utf8_4(ch) = (0xf0 | dn18(ch), 0x80 | msk6(dn12(ch)),  # lead byte, bits 12-17
+                          0x80 | msk6(dn6(ch)), 0x80 | msk6(ch))   # bits 6-11, bits 0-5
 
 # Optimized in-place conversion to UTF-8 for hashing compatibly with isequal / String
-@inline shift_n(v, n) = v%UInt64 << ((n & 7)%UInt<<3)
+@inline shift_n(v, n) = u64(v) << (((n & 7)%UInt)<<3)
 
 # if cnt == 0 - 4, bytes must fit in k1
 # cnt between 5 - 8, may overflow into k2
 # if l == 8 - 12,  bytes must fit in k2
 # cnt between 12 - 15, may overflow into k3
 
+mergebytes(b1, b2)         = b1 | up8(b2)
+mergebytes(b1, b2, b3)     = b1 | up8(b2) | up16(b3)
+mergebytes(b1, b2, b3, b4) = b1 | up8(b2) | up16(b3) | up24(b4)
+
 @inline function add_utf8(cnt, ch, k1::UInt64)
     if ch <= 0x7f
         cnt + 1, k1 | shift_n(ch, cnt)
     elseif ch <= 0x7ff
         b1, b2 = get_utf8_2(ch)
-        cnt + 2, k1 | shift_n(b1 | b2%UInt32<<8, cnt)
+        cnt + 2, k1 | shift_n(mergebytes(b1, b2), cnt)
     elseif ch <= 0xffff
         b1, b2, b3 = get_utf8_3(ch)
-        cnt + 3, k1 | shift_n(b1 | b2%UInt32<<8 | b3%UInt32<<16, cnt)
+        cnt + 3, k1 | shift_n(mergebytes(b1, b2, b3), cnt)
     else
         b1, b2, b3, b4 = get_utf8_4(ch)
-        cnt + 4, k1 | shift_n(b1 | b2%UInt32<<8 | b3%UInt32<<16 | b4%UInt32<<24, cnt)
+        cnt + 4, k1 | shift_n(mergebytes(b1, b2, b3, b4), cnt)
     end
 end
 
 @inline function add_utf8_split(cnt, ch, k1::UInt64)
     if ch <= 0x7f
-        cnt + 1, k1 | shift_n(ch, cnt), 0%UInt64
+        cnt + 1, k1 | shift_n(ch, cnt), u64(0)
     elseif ch <= 0x7ff
         b1, b2 = get_utf8_2(ch)
         if (cnt & 7) == 7
-            cnt + 2, k1 | b1%UInt64<<56, b2%UInt64
+            cnt + 2, k1 | up56(b1), u64(b2)
         else
-            cnt + 2, k1 | shift_n(b1 | b2%UInt32<<8, cnt), 0%UInt64
+            cnt + 2, k1 | shift_n(b1 | up8(b2), cnt), u64(0)
         end
     elseif ch <= 0xffff
         b1, b2, b3 = get_utf8_3(ch)
         if (cnt & 7) == 5
-            cnt + 3, k1 | b1%UInt64<<40 | b2%UInt64<<48 | b3%UInt64<<56, 0%UInt64
+            cnt + 3, k1 | up40(b1) | up48(b2) | up56(b3), u64(0)
         elseif (cnt & 7) == 6
-            cnt + 3, k1 | b1%UInt64<<48 | b2%UInt64<<56, b3%UInt64
+            cnt + 3, k1 | up48(b1) | up56(b2), u64(b3)
         else
-            cnt + 3, k1 | b1%UInt64<<56, b2 | b3%UInt64<<8
+            cnt + 3, k1 | up56(b1), u64(b2) | up8(b3)
         end
     else
         # This will always go over, may be 1, 2, 3 bytes in second word
         b1, b2, b3, b4 = get_utf8_4(ch)
         if (cnt & 7) == 5
-            cnt + 4, k1 | b1%UInt64<<40 | b2%UInt64<<48 | b3%UInt64<<56, b4%UInt64
+            cnt + 4, k1 | up40(b1) | up48(b2) | up56(b3), u64(b4)
         elseif (cnt & 7) == 6
-            cnt + 4, k1 | b1%UInt64<<48 | b2%UInt64<<56, b3 | b4%UInt64<<8
+            cnt + 4, k1 | up48(b1) | up56(b2), u64(b3) | up8(b4)  # widen b3: overflow word must be UInt64
         else
-            cnt + 4, k1 | b1%UInt64<<56, b2 | b3%UInt64<<8 | b4%UInt64<<16
+            cnt + 4, k1 | up56(b1), u64(b2) | up8(b3) | up16(b4)
         end
     end
 end
@@ -130,23 +149,23 @@ end
 
 # AbstractString MurmurHash3, converts to UTF-8 on the fly
 function mmhash128_8_c(str::AbstractString, seed::UInt32)
-    k1 = k2 = 0%UInt64
-    h1 = h2 = seed%UInt64
+    k1 = k2 = u64(0)
+    h1 = h2 = u64(seed)
     cnt = len = 0
     @inbounds for ch in str
         if cnt < 5
-            cnt, k1 = add_utf8(cnt, ch%UInt32, k1)
+            cnt, k1 = add_utf8(cnt, u32(ch), k1)
         elseif cnt < 8
-            cnt, k1, k2 = add_utf8_split(cnt, ch%UInt32, k1)
+            cnt, k1, k2 = add_utf8_split(cnt, u32(ch), k1)
         elseif cnt < 13
-            cnt, k2 = add_utf8(cnt, ch%UInt32, k2)
+            cnt, k2 = add_utf8(cnt, u32(ch), k2)
         else
-            cnt, k2, k3 = add_utf8_split(cnt, ch%UInt32, k2)
+            cnt, k2, k3 = add_utf8_split(cnt, u32(ch), k2)
             # When k1 and k2 are full, then hash another block
             if cnt > 15
                 h1, h2 = mhblock(h1, h2, k1, k2)
                 k1 = k3
-                k2 = 0%UInt64
+                k2 = u64(0)
                 len += 16
                 cnt &= 15
             end
@@ -166,10 +185,10 @@ end
 # the start of the strings are 8-byte aligned, and it is safe to access a full
 # 8-byte chunk always at the end (simply masking off the remaining 1-7 bytes)
 
-@inline mask_load(pnt, left) = unsafe_load(pnt) & ((1%UInt64 << ((left & 7) << 3)) - 0x1)
+@inline mask_load(pnt, left) = unsafe_load(pnt) & ((UInt64(1) << ((left & 7) << 3)) - 0x1)
 
 function mmhash128_8_a(len::Integer, pnt::Ptr, seed::UInt32)
-    pnt8, h1, h2 = mhbody(len >>> 4, reinterpret(Ptr{UInt64}, pnt), seed%UInt64, seed%UInt64)
+    pnt8, h1, h2 = mhbody(len >>> 4, reinterpret(Ptr{UInt64}, pnt), u64(seed), u64(seed))
     if (left = len & 15) > 0
         h1 = mhtail1(h1, left < 8 ? mask_load(pnt8, left) : unsafe_load(pnt8))
         left > 8 && (h2 = mhtail2(h2, mask_load(pnt8 + 8, left)))
@@ -178,9 +197,9 @@ function mmhash128_8_a(len::Integer, pnt::Ptr, seed::UInt32)
 end
 
 function mmhash128_8_a(seed::Integer)
-    h1 = fmix(2*(seed%UInt64))
-    h2 = fmix(3*(seed%UInt64))
-    h1 + h2, h1 + 2*h2
+    h1 = fmix(2 * u64(seed))
+    h2 = fmix(3 * u64(seed))
+    h1 + h2, h1 + 2 * h2
 end
 
 #----------------------------------------------------------------------------
@@ -196,13 +215,13 @@ end
 function mmhash128_8_u(len::Integer, unaligned_pnt::Ptr, seed::UInt32)
     # Should optimize handling of short (< 16 byte) unaligned strings
     ulp = reinterpret(UInt, unaligned_pnt)
-    pnt = reinterpret(Ptr{UInt64}, ulp & ~(7%UInt64))
-    fin = reinterpret(Ptr{UInt64}, (ulp + len + 0x7) & ~(7%UInt64)) - 8
-    shft = (ulp & 7%UInt)<<3
+    pnt = reinterpret(Ptr{UInt64}, ulp & ~u64(7))
+    fin = reinterpret(Ptr{UInt64}, (ulp + len + 0x7) & ~u64(7)) - 8
+    shft = (ulp & u64(7))<<3
     # println("_mmhash128_8_u($len, $unaligned_pnt, $seed) => $pnt, $fin")
-    h1 = h2 = seed%UInt64
+    h1 = h2 = u64(seed)
     k1 = unsafe_load(pnt) # Pick up first 1-7 bytes
-    k2 = 0%UInt64
+    k2 = u64(0)
     while pnt < fin
         k1, k2, k3 = shift_mix(shft, k1, unsafe_load(pnt += 8), unsafe_load(pnt += 8))
         # print(" pnt=$pnt, k1=0x$(outhex(k1)), k2=0x$(outhex(k2))")
@@ -251,14 +270,14 @@ end
 function mmhash32(len, pnt, seed::UInt32)
     pnt, h1 = mhbody(len >>> 2, reinterpret(Ptr{UInt32}, pnt), seed)
     (len & 3) == 0 || (h1 = xor(h1, rotl15(unsafe_load(pnt)) * d1) * d2)
-    fmix(xor(h1, len%UInt32))
+    fmix(xor(h1, u32(len)))
 end
 
 @inline function mhfin(len, h1, h2, h3, h4)
-    h1 = xor(h1, len%UInt32)
-    h2 = xor(h2, len%UInt32)
-    h3 = xor(h3, len%UInt32)
-    h4 = xor(h4, len%UInt32)
+    h1 = xor(h1, u32(len))
+    h2 = xor(h2, u32(len))
+    h3 = xor(h3, u32(len))
+    h4 = xor(h4, u32(len))
 
     h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1
 
@@ -269,7 +288,7 @@ end
 
     h1 += h2; h1 += h3; h1 += h4; h2 += h1; h3 += h1; h4 += h1
 
-    h2%UInt64 << 32 | h1, h4%UInt64 << 32 | h3
+    up32(h2) | h1, up32(h4) | h3
 end
 
 #-----------------------------------------------------------------------------
@@ -315,51 +334,51 @@ function mmhash128_4(len, pnt, seed::UInt32)
     mhfin(len, h1, h2, h3, h4)
 end
 
-@inline shift_n_32(v, n) = v%UInt32 << ((n & 7)%UInt<<3)
+@inline shift_n_32(v, n) = u32(v) << (((n & 7)%UInt)<<3)
 
 @inline function get_utf8(cnt, ch)
     if ch <= 0x7f
-        cnt + 1, ch%UInt32
+        cnt + 1, u32(ch)
     elseif ch <= 0x7ff
         b1, b2 = get_utf8_2(ch)
-        cnt + 2, b1 | b2%UInt32<<8
+        cnt + 2, mergebytes(b1, b2)
     elseif ch <= 0xffff
         b1, b2, b3 = get_utf8_3(ch)
-        cnt + 3, b1 | b2%UInt32<<8 | b3%UInt32<<16
+        cnt + 3, mergebytes(b1, b2, b3)
     else
         b1, b2, b3, b4 = get_utf8_4(ch)
-        cnt + 4, b1 | b2%UInt32<<8 | b3%UInt32<<16 | b4%UInt32<<24
+        cnt + 4, mergebytes(b1, b2, b3, b4)
     end
 end
 
 @inline function add_utf8_split(cnt, ch, k1::UInt32)
     if ch <= 0x7f
-        cnt + 1, k1 | shift_n_32(ch, cnt), 0%UInt32
+        cnt + 1, k1 | shift_n_32(ch, cnt), u32(0)
     elseif ch <= 0x7ff
         b1, b2 = get_utf8_2(ch)
         if (cnt & 7) == 3
-            cnt + 2, k1 | b1%UInt32<<24, b2%UInt32
+            cnt + 2, k1 | up24(b1), u32(b2)
         else
-            cnt + 2, k1 | shift_n_32(b1 | b2%UInt32<<8, cnt), 0%UInt32
+            cnt + 2, k1 | shift_n_32(b1 | up8(b2), cnt), u32(0)
         end
     elseif ch <= 0xffff
         b1, b2, b3 = get_utf8_3(ch)
         if (cnt & 7) == 1
-            cnt + 3, k1 | b1%UInt32<<8 | b2%UInt32<<16 | b3%UInt32<<24, 0%UInt32
+            cnt + 3, k1 | up8(b1) | up16(b2) | up24(b3), u32(0)
         elseif (cnt & 7) == 2
-            cnt + 3, k1 | b1%UInt32<<16 | b2%UInt32<<24, b3%UInt32
+            cnt + 3, k1 | up16(b1) | up24(b2), u32(b3)  # b1/b2 land in bytes 2 and 3 of k1
         else
-            cnt + 3, k1 | b1%UInt32<<24, b2 | b3%UInt32<<8
+            cnt + 3, k1 | up24(b1), b2 | up8(b3)
         end
     else
         # This will always go over, may be 1, 2, 3 bytes in second word
         b1, b2, b3, b4 = get_utf8_4(ch)
         if (cnt & 7) == 1
-            cnt + 4, k1 | b1%UInt32<<8 | b2%UInt32<<16 | b3%UInt32<<24, b4%UInt32
+            cnt + 4, mergebytes(k1, b1, b2, b3), u32(b4)
         elseif (cnt & 7) == 2
-            cnt + 4, k1 | b1%UInt32<<16 | b2%UInt32<<24, b3 | b4%UInt32<<8
+            cnt + 4, k1 | up16(b1) | up24(b2), b3 | up8(b4)  # b1/b2 land in bytes 2 and 3 of k1
         else
-            cnt + 4, k1 | b1%UInt32<<24, b2 | b3%UInt32<<8 | b4%UInt32<<16
+            cnt + 4, k1 | up24(b1), mergebytes(b2, b3, b4)
         end
     end
 end