diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c7fc43f..d4bd33a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -17,8 +17,8 @@ jobs: strategy: matrix: rust: - - "1.60.0" - - "1.65.0" + - "1.66.0" + - "1.76.0" steps: - uses: dtolnay/rust-toolchain@master with: diff --git a/Cargo.toml b/Cargo.toml index 2a604da..379f9a1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,3 +30,7 @@ harness = false [features] default = ["parallel"] parallel = ["rayon", "crossbeam-utils"] + +[profile.release] +lto = true +debug = 2 diff --git a/benches/build.rs b/benches/build.rs index 6e0da04..5daba91 100644 --- a/benches/build.rs +++ b/benches/build.rs @@ -4,34 +4,157 @@ extern crate bencher; use bencher::Bencher; -use boomphf::Mphf; +use boomphf::{ExternallyHashed, Mphf}; -fn build1_ser(bench: &mut Bencher) { +fn build1_ser_u64(bench: &mut Bencher) { + let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_externally_hashed(bench: &mut Bencher) { + let items: Vec = (0..1000000u64) + .map(|x| ExternallyHashed(wyhash::wyrng(&mut (x * 2)))) + .collect(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_slices(bench: &mut Bencher) { + let items: Vec<[u8; 8]> = (0..1000000u64).map(|x| (x * 2).to_le_bytes()).collect(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_long_slices(bench: &mut Bencher) { + let items = (0..1000000u64) + .map(|x| { + let mut long_key = [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + long_key + }) + .collect::>(); bench.iter(|| { - let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); - let _ = Mphf::new(2.0, &items); + std::hint::black_box(Mphf::new(2.0, &items)); + }); +} + +fn build1_ser_long_slices_externally_hashed(bench: &mut Bencher) { + let items = (0..1000000u64) + .map(|x| { + let mut long_key = [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + ExternallyHashed(wyhash::wyhash(&long_key, 0)) + }) + .collect::>(); + bench.iter(|| { + std::hint::black_box(Mphf::new(2.0, &items)); }); } #[allow(dead_code)] -fn build1_par(bench: &mut Bencher) { +fn build1_par_u64(bench: &mut Bencher) { + let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); #[cfg(feature = "parallel")] bench.iter(|| { - let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); - let _ = Mphf::new_parallel(2.0, &items, None); + std::hint::black_box(Mphf::new_parallel(2.0, &items, None)); }); } -fn scan1_ser(bench: &mut Bencher) { +#[allow(dead_code)] +fn build1_par_slices(bench: &mut Bencher) { + let items: Vec<[u8; 8]> = (0..1000000u64).map(|x| (x * 2).to_le_bytes()).collect(); + #[cfg(feature = "parallel")] + bench.iter(|| { + std::hint::black_box(Mphf::new_parallel(2.0, &items, None)); + }); +} + +fn scan1_ser_u64(bench: &mut Bencher) { let items: Vec = (0..1000000u64).map(|x| x * 2).collect(); let phf = Mphf::new(2.0, &items); bench.iter(|| { - for i in (0..1000000u64).map(|x| x * 2) { - phf.hash(&i); + for i in &items { + std::hint::black_box(phf.hash(&i)); + } + }); +} + +fn scan1_ser_slice(bench: &mut Bencher) { + let items: Vec<[u8; 8]> = (0..1000000u64).map(|x| (x * 2).to_le_bytes()).collect(); + let phf = Mphf::new(2.0, &items); + + bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); + } + }); +} + +fn scan1_ser_externally_hashed(bench: &mut Bencher) { + let items: Vec = (0..1000000u64) + .map(|x| 
ExternallyHashed(wyhash::wyrng(&mut (x * 2)))) + .collect(); + let phf = Mphf::new(2.0, &items); + + bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); + } + }); +} + +fn scan1_ser_long_key(bench: &mut Bencher) { + let items = (0..1000000u64) + .map(|x| { + let mut long_key = [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + long_key + }) + .collect::>(); + let phf = Mphf::new(2.0, &items); + + bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); + } + }); +} + +fn scan1_ser_long_key_externally_hashed(bench: &mut Bencher) { + let items: Vec = (0..1000000u64) + .map(|x| { + let mut long_key = [0u8; 128]; + long_key[0..8].copy_from_slice(&(x * 2).to_le_bytes()); + ExternallyHashed(wyhash::wyhash(&long_key, 0)) + }) + .collect(); + let phf = Mphf::new(2.0, &items); + + bench.iter(|| { + for i in &items { + std::hint::black_box(phf.hash(i)); } }); } -benchmark_group!(benches, build1_ser, build1_par, scan1_ser); +benchmark_group!( + benches, + build1_ser_externally_hashed, + build1_ser_u64, + build1_ser_slices, + build1_ser_long_slices, + build1_ser_long_slices_externally_hashed, + build1_par_u64, + build1_par_slices, + scan1_ser_u64, + scan1_ser_slice, + scan1_ser_externally_hashed, + scan1_ser_long_key, + scan1_ser_long_key_externally_hashed +); benchmark_main!(benches); diff --git a/src/bitvector.rs b/src/bitvector.rs index b6e7b34..fbce59f 100644 --- a/src/bitvector.rs +++ b/src/bitvector.rs @@ -37,6 +37,92 @@ type Word = AtomicU64; #[cfg(not(feature = "parallel"))] type Word = u64; +#[cfg(target_endian = "little")] +#[derive(Clone, Copy, Debug)] +pub struct BitVectorRef<'a> { + bits: u64, + vector: &'a [u64], +} + +impl<'a> PartialEq for BitVectorRef<'a> { + fn eq(&self, other: &BitVectorRef<'_>) -> bool { + self.eq_left(other, self.bits) + } +} + +impl<'a> PartialEq for BitVectorRef<'a> { + fn eq(&self, other: &BitVector) -> bool { + self.eq_left(&other.as_ref(), self.bits) + } +} + +#[cfg(target_endian = "little")] +impl<'a> BitVectorRef<'a> { + #[cfg(target_endian = "little")] + pub(crate) fn from_dma(dma: [&'a [u8]; 2]) -> Self { + assert_eq!(dma[0].len(), std::mem::size_of::(), "Must be a u64"); + assert_eq!( + dma[1].len() % std::mem::size_of::(), + 0, + "Must be an even number of u64" + ); + assert_eq!( + dma[1].as_ptr().align_offset(std::mem::size_of::()), + 0, + "BitVectorRef must be passed an 8-byte aligned slice" + ); + + let vector = crate::u8_slice_cast(dma[1]); + + Self { + bits: u64::from_le_bytes(dma[0].try_into().unwrap()), + vector, + } + } + + /// the max number of elements can be inserted into set + pub fn capacity(&self) -> u64 { + self.bits + } + + /// If `bit` belongs to set, return `true`, else return `false`. + /// + /// Insert, remove and contains do not do bound check. + #[inline] + pub fn contains(&self, bit: u64) -> bool { + let (word, mask) = word_mask(bit); + (self.get_word(word) & mask) != 0 + } + + #[inline] + pub fn get_word(&self, word: usize) -> u64 { + self.vector[word] + } + + /// compare if the following is true: + /// + /// self \cap {0, 1, ... , bit - 1} == other \cap {0, 1, ... ,bit - 1} + pub fn eq_left(&self, other: &BitVectorRef<'_>, bit: u64) -> bool { + if bit == 0 { + return true; + } + let (word, offset) = word_offset(bit - 1); + // We can also use slice comparison, which only take 1 line. + // However, it has been reported that the `Eq` implementation of slice + // is extremly slow. + // + // self.vector.as_slice()[0 .. word] == other.vector.as_slice[0 .. 
word] + // + self.vector + .iter() + .zip(other.vector.iter()) + .take(word as usize) + .all(|(s1, s2)| *s1 == *s2) + && (self.get_word(word as usize) << (63 - offset)) + == (other.get_word(word as usize) << (63 - offset)) + } +} + /// Bitvector #[derive(Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] @@ -53,6 +139,53 @@ pub struct BitVector { vector: Box<[u64]>, } +impl BitVector { + #[cfg(target_endian = "little")] + pub(crate) fn dma(&self) -> [&[u8]; 2] { + #[cfg(feature = "parallel")] + let vec: &[u64] = unsafe { std::mem::transmute::<&[AtomicU64], &[u64]>(&*self.vector) }; + + #[cfg(not(feature = "parallel"))] + let vec: &[u64] = &*self.vector; + + let vec_len = std::mem::size_of_val(vec); + let vec_ptr: *const [u64] = vec; + let vec_ptr = vec_ptr as *const u8; + + let vec: &[u8] = unsafe { std::slice::from_raw_parts(vec_ptr, vec_len) }; + + let bits: *const u64 = &self.bits; + let bits = + unsafe { std::slice::from_raw_parts(bits as *const u8, std::mem::size_of::()) }; + + [bits, vec] + } + + #[cfg(target_endian = "little")] + pub(crate) fn copy_from_dma(dma: [&[u8]; 2]) -> Self { + assert_eq!(dma[0].len(), std::mem::size_of::(), "Must be a u64"); + assert_eq!( + dma[1].len() % std::mem::size_of::(), + 0, + "Must be an even number of u64" + ); + + #[cfg(feature = "parallel")] + let mapper = |chunk: &[u8]| AtomicU64::new(u64::from_le_bytes(chunk.try_into().unwrap())); + + #[cfg(not(feature = "parallel"))] + let mapper = |chunk: &[u8]| u64::from_le_bytes(chunk.try_into().unwrap()); + + Self { + bits: u64::from_le_bytes(dma[0].try_into().unwrap()), + vector: dma[1] + .chunks_exact(std::mem::size_of::()) + .map(mapper) + .collect(), + } + } +} + // Custom serializer #[cfg(all(feature = "serde", feature = "parallel"))] fn ser_atomic_vec(v: &[AtomicU64], serializer: S) -> Result @@ -134,6 +267,12 @@ impl PartialEq for BitVector { } } +impl PartialEq> for BitVector { + fn eq(&self, other: &BitVectorRef<'_>) -> bool { + self.as_ref().eq_left(other, self.bits) + } +} + impl BitVector { /// Build a new empty bitvector pub fn new(bits: u64) -> Self { @@ -149,6 +288,19 @@ impl BitVector { } } + #[inline(always)] + pub fn as_ref<'a>(&'a self) -> BitVectorRef<'a> { + let vector = &*self.vector; + + #[cfg(feature = "parallel")] + let vector = unsafe { std::mem::transmute::<&[AtomicU64], &[u64]>(vector) }; + + BitVectorRef { + bits: self.bits, + vector, + } + } + /// new bitvector contains all elements /// /// If `bits % 64 > 0`, the last u64 is guaranteed not to @@ -209,37 +361,14 @@ impl BitVector { /// Insert, remove and contains do not do bound check. #[inline] pub fn contains(&self, bit: u64) -> bool { - let (word, mask) = word_mask(bit); - (self.get_word(word) & mask) != 0 + self.as_ref().contains(bit) } /// compare if the following is true: /// /// self \cap {0, 1, ... , bit - 1} == other \cap {0, 1, ... ,bit - 1} pub fn eq_left(&self, other: &BitVector, bit: u64) -> bool { - if bit == 0 { - return true; - } - let (word, offset) = word_offset(bit - 1); - // We can also use slice comparison, which only take 1 line. - // However, it has been reported that the `Eq` implementation of slice - // is extremly slow. - // - // self.vector.as_slice()[0 .. word] == other.vector.as_slice[0 .. 
word] - // - self.vector - .iter() - .zip(other.vector.iter()) - .take(word as usize) - .all(|(s1, s2)| { - #[cfg(feature = "parallel")] - return s1.load(Ordering::Relaxed) == s2.load(Ordering::Relaxed); - - #[cfg(not(feature = "parallel"))] - return s1 == s2; - }) - && (self.get_word(word as usize) << (63 - offset)) - == (other.get_word(word as usize) << (63 - offset)) + self.as_ref().eq_left(&other.as_ref(), bit) } /// insert a new element to set @@ -362,11 +491,7 @@ impl BitVector { #[inline] pub fn get_word(&self, word: usize) -> u64 { - #[cfg(feature = "parallel")] - return self.vector[word].load(Ordering::Relaxed) as u64; - - #[cfg(not(feature = "parallel"))] - return self.vector[word] as u64; + self.as_ref().get_word(word) } pub fn num_words(&self) -> usize { diff --git a/src/hashmap.rs b/src/hashmap.rs index 49df9f0..3d54135 100644 --- a/src/hashmap.rs +++ b/src/hashmap.rs @@ -3,17 +3,16 @@ #[cfg(feature = "serde")] use serde::{self, Deserialize, Serialize}; -use crate::Mphf; +use crate::{Mphf, SeedableHash}; use std::borrow::Borrow; use std::fmt::Debug; -use std::hash::Hash; use std::iter::ExactSizeIterator; /// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. This lets us store the keys and values in dense /// arrays, with ~3 bits/item overhead in the Mphf. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct BoomHashMap { +pub struct BoomHashMap { mphf: Mphf, pub(crate) keys: Vec, pub(crate) values: Vec, @@ -21,7 +20,7 @@ pub struct BoomHashMap { impl BoomHashMap where - K: Hash + Debug + PartialEq, + K: SeedableHash + Debug + PartialEq, D: Debug, { fn create_map(mut keys: Vec, mut values: Vec, mphf: Mphf) -> BoomHashMap { @@ -49,7 +48,7 @@ where pub fn get(&self, kmer: &Q) -> Option<&D> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -69,7 +68,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<&mut D> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -89,7 +88,7 @@ where pub fn get_key_id(&self, kmer: &Q) -> Option where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -133,7 +132,7 @@ where impl core::iter::FromIterator<(K, D)> for BoomHashMap where - K: Hash + Debug + PartialEq, + K: SeedableHash + Debug + PartialEq, D: Debug, { fn from_iter>(iter: I) -> Self { @@ -149,21 +148,21 @@ where } #[cfg(feature = "parallel")] -pub trait ConstructibleKey: Hash + Debug + PartialEq + Send + Sync {} +pub trait ConstructibleKey: SeedableHash + Debug + PartialEq + Send + Sync {} #[cfg(feature = "parallel")] -impl ConstructibleKey for T where T: Hash + Debug + PartialEq + Send + Sync {} +impl ConstructibleKey for T where T: SeedableHash + Debug + PartialEq + Send + Sync {} #[cfg(not(feature = "parallel"))] -pub trait ConstructibleKey: Hash + Debug + PartialEq {} +pub trait ConstructibleKey: SeedableHash + Debug + PartialEq {} #[cfg(not(feature = "parallel"))] -impl ConstructibleKey for T where T: Hash + Debug + PartialEq {} +impl ConstructibleKey for T where T: SeedableHash + Debug + PartialEq {} #[cfg(feature = "parallel")] impl BoomHashMap where - K: Hash + Debug + PartialEq + Send + Sync, + K: SeedableHash + Debug + PartialEq + Send + Sync, D: Debug, { /// Create a new hash map from the parallel array `keys` and `values`, using a parallelized method to construct the Mphf. 
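The hashmap.rs hunks above swap every `std::hash::Hash` bound for the new `SeedableHash` trait (on the struct definitions, the getters, `FromIterator`, `ConstructibleKey`, and the iterator impls), so maps keyed by integers, byte slices, fixed-size byte arrays, `Vec<u8>`, `&str`, or `String` keep working unchanged. A minimal usage sketch, assuming the crate's existing `BoomHashMap::new(keys, values)` constructor:

```rust
use boomphf::hashmap::BoomHashMap;

fn main() {
    // Fixed-size byte-array keys now hit the wyhash fast path added in lib.rs.
    let keys: Vec<[u8; 8]> = (0..1_000u64).map(|x| (x * 2).to_le_bytes()).collect();
    let values: Vec<u64> = (0..1_000u64).collect();

    let map = BoomHashMap::new(keys, values);
    // Key 6u64 (= 3 * 2) was paired with value 3.
    assert_eq!(map.get(&6u64.to_le_bytes()), Some(&3));
}
```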
@@ -174,12 +173,12 @@ where } /// Iterate over key-value pairs in a BoomHashMap -pub struct BoomIterator<'a, K: Hash + 'a, D: 'a> { +pub struct BoomIterator<'a, K: SeedableHash + 'a, D: 'a> { hash: &'a BoomHashMap, index: usize, } -impl<'a, K: Hash, D> Iterator for BoomIterator<'a, K, D> { +impl<'a, K: SeedableHash, D> Iterator for BoomIterator<'a, K, D> { type Item = (&'a K, &'a D); fn next(&mut self) -> Option { @@ -199,9 +198,9 @@ impl<'a, K: Hash, D> Iterator for BoomIterator<'a, K, D> { } } -impl<'a, K: Hash, D1> ExactSizeIterator for BoomIterator<'a, K, D1> {} +impl<'a, K: SeedableHash, D1> ExactSizeIterator for BoomIterator<'a, K, D1> {} -impl<'a, K: Hash, D> IntoIterator for &'a BoomHashMap { +impl<'a, K: SeedableHash, D> IntoIterator for &'a BoomHashMap { type Item = (&'a K, &'a D); type IntoIter = BoomIterator<'a, K, D>; @@ -219,19 +218,19 @@ impl<'a, K: Hash, D> IntoIterator for &'a BoomHashMap { /// arrays, with ~3 bits/item overhead in the Mphf. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct BoomHashMap2 { +pub struct BoomHashMap2 { mphf: Mphf, keys: Vec, values: Vec, aux_values: Vec, } -pub struct Boom2Iterator<'a, K: Hash + 'a, D1: 'a, D2: 'a> { +pub struct Boom2Iterator<'a, K: SeedableHash + 'a, D1: 'a, D2: 'a> { hash: &'a BoomHashMap2, index: usize, } -impl<'a, K: Hash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> { +impl<'a, K: SeedableHash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> { type Item = (&'a K, &'a D1, &'a D2); fn next(&mut self) -> Option { @@ -254,9 +253,9 @@ impl<'a, K: Hash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> { } } -impl<'a, K: Hash, D1, D2> ExactSizeIterator for Boom2Iterator<'a, K, D1, D2> {} +impl<'a, K: SeedableHash, D1, D2> ExactSizeIterator for Boom2Iterator<'a, K, D1, D2> {} -impl<'a, K: Hash, D1, D2> IntoIterator for &'a BoomHashMap2 { +impl<'a, K: SeedableHash, D1, D2> IntoIterator for &'a BoomHashMap2 { type Item = (&'a K, &'a D1, &'a D2); type IntoIter = Boom2Iterator<'a, K, D1, D2>; @@ -270,7 +269,7 @@ impl<'a, K: Hash, D1, D2> IntoIterator for &'a BoomHashMap2 { impl BoomHashMap2 where - K: Hash + Debug + PartialEq, + K: SeedableHash + Debug + PartialEq, D1: Debug, D2: Debug, { @@ -310,7 +309,7 @@ where pub fn get(&self, kmer: &Q) -> Option<(&D1, &D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -329,7 +328,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<(&mut D1, &mut D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -351,7 +350,7 @@ where pub fn get_key_id(&self, kmer: &Q) -> Option where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -395,7 +394,7 @@ where impl core::iter::FromIterator<(K, D1, D2)> for BoomHashMap2 where - K: Hash + Debug + PartialEq, + K: SeedableHash + Debug + PartialEq, D1: Debug, D2: Debug, { @@ -416,7 +415,7 @@ where #[cfg(feature = "parallel")] impl BoomHashMap2 where - K: Hash + Debug + PartialEq + Send + Sync, + K: SeedableHash + Debug + PartialEq + Send + Sync, D1: Debug, D2: Debug, { @@ -500,7 +499,7 @@ where pub fn get(&self, kmer: &Q) -> Option<&D1> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -513,7 +512,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<&mut D1> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let 
maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { @@ -619,7 +618,7 @@ where pub fn get(&self, kmer: &Q) -> Option<(&D1, &D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); maybe_pos.map(|pos| (&self.values[pos as usize], &self.aux_values[pos as usize])) @@ -629,7 +628,7 @@ where pub fn get_mut(&mut self, kmer: &Q) -> Option<(&mut D1, &mut D2)> where K: Borrow, - Q: Hash + Eq, + Q: SeedableHash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); maybe_pos.map(|pos| { diff --git a/src/lib.rs b/src/lib.rs index 32ca5fc..27164a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,6 +37,8 @@ pub mod hashmap; #[cfg(feature = "parallel")] mod par_iter; use bitvector::BitVector; +#[cfg(target_endian = "little")] +use bitvector::BitVectorRef; use log::error; use std::borrow::Borrow; @@ -52,21 +54,184 @@ use std::sync::{Arc, Mutex}; #[cfg(feature = "serde")] use serde::{self, Deserialize, Serialize}; -#[inline] -fn fold(v: u64) -> u32 { - ((v & 0xFFFFFFFF) as u32) ^ ((v >> 32) as u32) +/// fastmod used to construct the seed as 1 << (iters + iters). However, for external hashing +/// there's a faster path available via lookup tables if we just pass in iters. This method is +/// to ensure that pre-existing hashes continue to work as before when not using ExternallyHashed. +#[inline(always)] +fn default_seed_correction(seed: u64) -> u64 { + 1 << (seed + seed) } -#[inline] -fn hash_with_seed(iter: u64, v: &T) -> u64 { - let mut state = wyhash::WyHash::with_seed(1 << (iter + iter)); - v.hash(&mut state); +fn default_hash_with_seed(value: &T, seed: u64) -> u64 { + let mut state = wyhash::WyHash::with_seed(1 << (seed + seed)); + value.hash(&mut state); state.finish() } +// This custom trait allows us to fast-path &[u8] to avoid constructing the temporary Hasher object. +// Can be simplified once specialization is stabilized. 
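The `default_seed_correction` helper keeps the per-level WyHash seed bit-for-bit identical to what the old `hash_with_seed` used (`1 << (iter + iter)`), which is what preserves previously computed hashes for keys that go through the default path. Since `1 << (2 * seed)` is just `4^seed`, levels 0, 1, 2, ... hash with seeds 1, 4, 16, .... A small sanity sketch; the helper is re-declared here only for illustration because the crate keeps it private:

```rust
// Mirror of the crate-private helper, re-declared purely to show the arithmetic.
fn default_seed_correction(seed: u64) -> u64 {
    1 << (seed + seed)
}

fn main() {
    // Level i hashes with WyHash seed 4^i (for the levels whose shift fits in a u64).
    for level in 0..32u64 {
        assert_eq!(default_seed_correction(level), 4u64.pow(level as u32));
    }
}
```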
+pub trait SeedableHash { + fn hash_with_seed(&self, seed: u64) -> u64; +} + +impl SeedableHash for [u8] { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(self, default_seed_correction(seed)) + } +} + +impl SeedableHash for [u8; N] { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(self, default_seed_correction(seed)) + } +} + +impl SeedableHash for u8 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&[*self], default_seed_correction(seed)) + } +} + +impl SeedableHash for i16 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for u16 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for i32 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for u32 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for i64 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for u64 { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for isize { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for usize { + fn hash_with_seed(&self, seed: u64) -> u64 { + wyhash::wyhash(&self.to_le_bytes(), default_seed_correction(seed)) + } +} + +impl SeedableHash for &T { + fn hash_with_seed(&self, seed: u64) -> u64 { + (**self).hash_with_seed(seed) + } +} + +impl SeedableHash for &[T] { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +impl SeedableHash for Vec { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +impl SeedableHash for &str { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +impl SeedableHash for String { + fn hash_with_seed(&self, seed: u64) -> u64 { + default_hash_with_seed(self, seed) + } +} + +/// This is a fast-path where the hash for an entry is known externally. That way we can skip hashing the +/// key for building / lookups which provides savings as keys grow longer or you need to do a lookup of the +/// same key across multiple perfect hashes. It's the user's responsibility to construct this with a value +/// that is deterministically derived from a key. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ExternallyHashed(pub u64); + +impl ExternallyHashed { + // Helper function for wyrng. + const fn wymum(a: u64, b: u64) -> u64 { + let mul = a as u128 * b as u128; + ((mul >> 64) ^ mul) as u64 + } + + // wyrng except a constified version + const fn wyrng(seed: u64) -> u64 { + const P0: u64 = 0xa076_1d64_78bd_642f; + const P1: u64 = 0xe703_7ed1_a0b4_28db; + + let seed = seed.wrapping_add(P0); + Self::wymum(seed ^ P1, seed) + } + + // Generate lookup tables to map the hash seed to a random value. 
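Because `SeedableHash` is public, downstream key types can opt into the same per-level seeding by delegating to one of the impls above. A sketch with a hypothetical `Kmer` newtype (the type, its field, and the assumption that the fixed-size-array impl covers arbitrary lengths — the benches exercise 8- and 128-byte arrays — are illustrative, not part of this patch):

```rust
use boomphf::{Mphf, SeedableHash};

// Hypothetical user-defined key type, shown only for illustration.
#[derive(Debug)]
struct Kmer([u8; 16]);

impl SeedableHash for Kmer {
    fn hash_with_seed(&self, seed: u64) -> u64 {
        // Delegate to the byte-array impl so the key is hashed with the
        // same per-level seed scheme as plain &[u8] keys.
        self.0.hash_with_seed(seed)
    }
}

fn main() {
    let keys: Vec<Kmer> = (0..100u64)
        .map(|i| {
            let mut bytes = [0u8; 16];
            bytes[..8].copy_from_slice(&i.to_le_bytes());
            Kmer(bytes)
        })
        .collect();
    let phf = Mphf::new(1.7, &keys);
    assert!(phf.hash(&keys[3]) < 100);
}
```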
+ const fn gen_seed_lookups() -> [u64; MAX_ITERS as usize + 1] { + let mut result = [0; MAX_ITERS as usize + 1]; + let mut i = 0; + while i <= MAX_ITERS { + result[i as usize] = Self::wyrng(i); + i += 1; + } + result + } + const SEED_HASH_LOOKUP_TABLES: [u64; MAX_ITERS as usize + 1] = Self::gen_seed_lookups(); + + // Helper utility to convert the seed passed in from hashmod (which is in 0..MAX_ITERS) into a hash. + fn fast_seed_hash(x: u64) -> u64 { + debug_assert!(x <= MAX_ITERS); + Self::SEED_HASH_LOOKUP_TABLES[x as usize] + } + + // Quickly combine two hashes. Because .0 represents a hash, we know it's random and doesn't need to be + // independently hashed again, so we just need to combine it uniquely with iters. + fn hash_combine(h1: u64, h2: u64) -> u64 { + // https://stackoverflow.com/questions/5889238/why-is-xor-the-default-way-to-combine-hashes + h1 ^ (h2 + .wrapping_add(0x517cc1b727220a95) + .wrapping_add(h1 << 6) + .wrapping_add(h1 >> 2)) + } +} + +impl SeedableHash for ExternallyHashed { + #[inline(always)] + fn hash_with_seed(&self, seed: u64) -> u64 { + Self::hash_combine(self.0, Self::fast_seed_hash(seed)) + } +} + #[inline] -fn hash_with_seed32(iter: u64, v: &T) -> u32 { - fold(hash_with_seed(iter, v)) +fn fold(v: u64) -> u32 { + ((v & 0xFFFFFFFF) as u32) ^ ((v >> 32) as u32) } #[inline] @@ -75,15 +240,139 @@ fn fastmod(hash: u32, n: u32) -> u64 { } #[inline] -fn hashmod(iter: u64, v: &T, n: u64) -> u64 { +fn hashmod(iter: u64, v: &T, n: u64) -> u64 { // when n < 2^32, use the fast alternative to modulo described here: // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ + let h = v.hash_with_seed(iter); if n < (1 << 32) { - let h = hash_with_seed32(iter, v); - fastmod(h, n as u32) as u64 + fastmod(fold(h), n as u32) as u64 } else { - let h = hash_with_seed(iter, v); - h % (n as u64) + h % n + } +} + +#[cfg(target_endian = "little")] +pub(crate) fn u8_slice_cast(s: &[u8]) -> &[T] { + assert_eq!(s.len() % std::mem::size_of::(), 0, "Invalid length"); + assert_eq!( + s.as_ptr().align_offset(std::mem::size_of::()), + 0, + "Misaligned - not safe" + ); + + let converted_length = s.len() / std::mem::size_of::(); + let converted: *const [u8] = s; + unsafe { std::slice::from_raw_parts(converted as *const T, converted_length) } +} + +/// Can only be used for lookups and same interface as Mphf. This is useful if you have the serialized +/// form stored somewhere in a raw uncompressed form and just want to reference it cheaply without copying +/// the bulk of the underlying data. In the future it might be possible to define an efficient contiguous +/// layout that lets us avoid even the heap allocation although it might be tricky since this is an array +/// of tuples where each tuple itself has 2 arrays (i.e. randomly accessing the BitVectorRef is tricky +/// without defining a fast serialized representation of an index to aide in that endeavour). +/// +/// Note: the type parameter doesn't actually mean anything since you can +/// launder by round-tripping through serialization (but that's true for serde as well). +#[derive(Clone, Debug)] +pub struct MphfRef<'a, T> { + bitvecs: Box<[(BitVectorRef<'a>, &'a [u64])]>, + phantom: PhantomData, +} + +impl<'a, T> MphfRef<'a, T> { + /// This takes an iterator of slices that compose all the parts of the original [Mphf] as returned by [Mphf::to_dma]. + /// This requires some memory allocations to set up the internal data structures, but the underlying data itself isn't + /// copied at all. 
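`ExternallyHashed` is the user-facing half of this: the caller hashes each key once with a function of their choosing (the tests and benches use `wyhash`), and `hash_with_seed` then only combines that stored digest with a table-driven per-level value instead of re-hashing the full key at every level. A usage sketch, assuming `wyhash` is also a direct dependency of the caller:

```rust
use boomphf::{ExternallyHashed, Mphf};

fn main() {
    let long_keys: Vec<[u8; 128]> = (0..10_000u64)
        .map(|i| {
            let mut k = [0u8; 128];
            k[..8].copy_from_slice(&i.to_le_bytes());
            k
        })
        .collect();

    // Hash each 128-byte key exactly once; the digest must be deterministic per key.
    let pre_hashed: Vec<ExternallyHashed> = long_keys
        .iter()
        .map(|k| ExternallyHashed(wyhash::wyhash(k, 0)))
        .collect();

    // Construction and lookups now only touch the 8-byte digests.
    let phf = Mphf::new(1.7, &pre_hashed);
    let idx = phf.hash(&ExternallyHashed(wyhash::wyhash(&long_keys[42], 0)));
    assert!(idx < 10_000);
}
```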
+ #[cfg(target_endian = "little")] + pub fn from_scatter_dma(mut dma: impl ExactSizeIterator) -> Self { + assert_eq!(dma.len() % 3, 0); + let num_bitvecs = dma.len() / 3; + let mut bitvecs = Vec::with_capacity(num_bitvecs); + + for _ in 0..num_bitvecs { + let bitvec = BitVectorRef::from_dma([dma.next().unwrap(), dma.next().unwrap()]); + let raw_ranks = dma.next().unwrap(); + bitvecs.push((bitvec, u8_slice_cast(raw_ranks))); + } + + assert_eq!(dma.next(), None, "Didn't properly consume DMA stream"); + + Self { + bitvecs: bitvecs.into_boxed_slice(), + phantom: Default::default(), + } + } + + #[inline] + fn get_rank(&self, hash: u64, i: usize) -> u64 { + let idx = hash as usize; + let (bv, ranks) = self.bitvecs.get(i).expect("that level doesn't exist"); + + // Last pre-computed rank + let mut rank = ranks[idx / 512]; + + // Add rank of intervening words + for j in (idx / 64) & !7..idx / 64 { + rank += bv.get_word(j).count_ones() as u64; + } + + // Add rank of final word up to hash + let final_word = bv.get_word(idx / 64); + if idx % 64 > 0 { + rank += (final_word << (64 - (idx % 64))).count_ones() as u64; + } + rank + } + + /// Compute the hash value of `item`. This method should only be used + /// with items known to be in construction set. Use `try_hash` if you cannot + /// guarantee that `item` was in the construction set. If `item` was not present + /// in the construction set this function may panic. + pub fn hash(&self, item: &Q) -> u64 + where + T: Borrow, + Q: ?Sized + SeedableHash, + { + for i in 0..self.bitvecs.len() { + let (bv, _) = &self.bitvecs[i]; + let hash = hashmod(i as u64, item, bv.capacity()); + + if bv.contains(hash) { + return self.get_rank(hash, i); + } + } + + unreachable!("must find a hash value"); + } + + /// Compute the hash value of `item`. If `item` was not present + /// in the set of objects used to construct the hash function, the return + /// value will an arbitrary value Some(x), or None. + pub fn try_hash(&self, item: &Q) -> Option + where + T: Borrow, + Q: ?Sized + SeedableHash, + { + for i in 0..self.bitvecs.len() { + let (bv, _) = &(self.bitvecs)[i]; + let hash = hashmod(i as u64, item, bv.capacity()); + + if bv.contains(hash) { + return Some(self.get_rank(hash, i)); + } + } + + None + } +} + +impl<'a, T> PartialEq> for MphfRef<'a, T> { + fn eq(&self, other: &Mphf) -> bool { + self.bitvecs + .iter() + .zip(other.bitvecs.iter()) + .all(|(b1, b2)| b1.0 == b2.0 && b1.1 == &*b2.1) } } @@ -95,9 +384,113 @@ pub struct Mphf { phantom: PhantomData, } +impl PartialEq for Mphf { + fn eq(&self, other: &Self) -> bool { + self.bitvecs == other.bitvecs + } +} + +impl PartialEq> for Mphf { + fn eq(&self, other: &MphfRef<'_, T>) -> bool { + self.bitvecs + .iter() + .zip(other.bitvecs.iter()) + .all(|(b1, b2)| b1.0 == b2.0 && &*b1.1 == b2.1) + } +} + const MAX_ITERS: u64 = 100; -impl<'a, T: 'a + Hash + Debug> Mphf { +impl Mphf { + pub fn num_dma_slices(&self) -> usize { + // See to_dma. + self.bitvecs.len() * 3 + } + + #[cfg(target_endian = "little")] + pub fn to_dma(&self) -> Vec<&[u8]> { + // Each bitvec has 3 DMA regions within it: the bits value within BitVector, the vector within BitVector and the ranks. 
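The `to_dma` / `from_scatter_dma` pair added here is exercised by the `dma_serde` test near the end of the patch; the sketch below shows the intended round trip on a little-endian target. Each level contributes three byte regions (the `bits` count, the word vector, and the ranks), and they must stay 8-byte aligned for the borrowed path:

```rust
use boomphf::{Mphf, MphfRef};

fn main() {
    let items: Vec<u64> = (0..100_000u64).map(|x| x * 2).collect();
    let phf = Mphf::new(1.7, &items);

    // Gather the raw little-endian regions (3 slices per level).
    let regions: Vec<&[u8]> = phf.to_dma();

    // Borrow them back as a lookup-only view without copying the bulk data...
    let by_ref = MphfRef::<u64>::from_scatter_dma(regions.iter().copied());
    // ...or rebuild an owned Mphf from the same regions.
    let copied = Mphf::<u64>::copy_from_scatter_dma(regions.iter().copied());

    for i in items.iter().take(1_000) {
        assert_eq!(phf.try_hash(i), by_ref.try_hash(i));
        assert_eq!(phf.try_hash(i), copied.try_hash(i));
    }
}
```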
+ let mut scattered = Vec::with_capacity(self.bitvecs.len() * 3); + for (bitvec, ranks) in self.bitvecs.iter() { + for piece in bitvec.dma() { + scattered.push(piece) + } + + let ranks = &**ranks; + let ranks_len = std::mem::size_of_val(ranks); + let ranks_ptr: *const [u64] = ranks; + let ranks_ptr: *const u64 = ranks_ptr as *const u64; + scattered + .push(unsafe { std::slice::from_raw_parts(ranks_ptr as *const u8, ranks_len) }); + } + + assert_eq!(scattered.len() % 3, 0); + scattered + } + + #[cfg(target_endian = "little")] + pub fn copy_from_scatter_dma<'a>(mut dma: impl ExactSizeIterator) -> Self { + assert_eq!(dma.len() % 3, 0); + let num_bitvecs = dma.len() / 3; + let mut bitvecs = Vec::with_capacity(num_bitvecs); + for _ in 0..num_bitvecs { + let bitvec = BitVector::copy_from_dma([dma.next().unwrap(), dma.next().unwrap()]); + let ranks = u8_slice_cast::(dma.next().unwrap()).into(); + bitvecs.push((bitvec, ranks)); + } + assert_eq!(dma.next(), None, "Didn't fully consume DMA slices"); + + Self { + bitvecs: bitvecs.into_boxed_slice(), + phantom: Default::default(), + } + } + + fn compute_ranks(bvs: Vec) -> Box<[(BitVector, Box<[u64]>)]> { + let mut ranks = Vec::new(); + let mut pop = 0_u64; + + for bv in bvs { + let mut rank: Vec = Vec::new(); + for i in 0..bv.num_words() { + let v = bv.get_word(i); + + if i % 8 == 0 { + rank.push(pop) + } + + pop += v.count_ones() as u64; + } + + ranks.push((bv, rank.into_boxed_slice())) + } + + ranks.into_boxed_slice() + } + + #[inline] + fn get_rank(&self, hash: u64, i: usize) -> u64 { + let idx = hash as usize; + let (bv, ranks) = self.bitvecs.get(i).expect("that level doesn't exist"); + + // Last pre-computed rank + let mut rank = ranks[idx / 512]; + + // Add rank of intervening words + for j in (idx / 64) & !7..idx / 64 { + rank += bv.get_word(j).count_ones() as u64; + } + + // Add rank of final word up to hash + let final_word = bv.get_word(idx / 64); + if idx % 64 > 0 { + rank += (final_word << (64 - (idx % 64))).count_ones() as u64; + } + rank + } +} + +impl<'a, T: 'a + SeedableHash + Debug> Mphf { /// Constructs an MPHF from a (possibly lazy) iterator over iterators. /// This allows construction of very large MPHFs without holding all the keys /// in memory simultaneously. @@ -127,7 +520,7 @@ impl<'a, T: 'a + Hash + Debug> Mphf { loop { if iter > MAX_ITERS { error!("ran out of key space. items: {:?}", done_keys.len()); - panic!("counldn't find unique hashes"); + panic!("couldn't find unique hashes"); } let keys_remaining = if iter == 0 { @@ -199,7 +592,7 @@ impl<'a, T: 'a + Hash + Debug> Mphf { object_pos = object_index + 1; - let idx = hashmod(seed, &key, size); + let idx = hashmod(seed, &&key, size); if collide.contains(idx) { a.remove(idx); @@ -226,7 +619,7 @@ impl<'a, T: 'a + Hash + Debug> Mphf { } } -impl Mphf { +impl Mphf { /// Generate a minimal perfect hash function for the set of `objects`. /// `objects` must not contain any duplicate items. 
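`compute_ranks` stores one absolute popcount checkpoint per 8 words (512 bits), and `get_rank` finishes a query with at most 7 full-word popcounts plus one masked popcount of the final word. A standalone sketch of the same arithmetic over a plain `&[u64]`, kept separate from the `BitVector` types:

```rust
// One absolute rank checkpoint per 8 words (512 bits), as in compute_ranks().
fn build_ranks(words: &[u64]) -> Vec<u64> {
    let mut ranks = Vec::new();
    let mut pop = 0u64;
    for (i, w) in words.iter().enumerate() {
        if i % 8 == 0 {
            ranks.push(pop);
        }
        pop += w.count_ones() as u64;
    }
    ranks
}

// rank(idx) = number of set bits strictly below idx, as in get_rank().
fn rank(words: &[u64], ranks: &[u64], idx: usize) -> u64 {
    let mut r = ranks[idx / 512];
    for j in (idx / 64) & !7..idx / 64 {
        r += words[j].count_ones() as u64;
    }
    if idx % 64 > 0 {
        r += (words[idx / 64] << (64 - (idx % 64))).count_ones() as u64;
    }
    r
}

fn main() {
    let words = vec![u64::MAX; 32]; // 2048 set bits
    let ranks = build_ranks(&words);
    assert_eq!(rank(&words, &ranks, 0), 0);
    assert_eq!(rank(&words, &ranks, 700), 700);
}
```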
/// `gamma` controls the tradeoff between the construction-time and run-time speed, @@ -274,49 +667,6 @@ impl Mphf { } } - fn compute_ranks(bvs: Vec) -> Box<[(BitVector, Box<[u64]>)]> { - let mut ranks = Vec::new(); - let mut pop = 0_u64; - - for bv in bvs { - let mut rank: Vec = Vec::new(); - for i in 0..bv.num_words() { - let v = bv.get_word(i); - - if i % 8 == 0 { - rank.push(pop) - } - - pop += v.count_ones() as u64; - } - - ranks.push((bv, rank.into_boxed_slice())) - } - - ranks.into_boxed_slice() - } - - #[inline] - fn get_rank(&self, hash: u64, i: usize) -> u64 { - let idx = hash as usize; - let (bv, ranks) = self.bitvecs.get(i).expect("that level doesn't exist"); - - // Last pre-computed rank - let mut rank = ranks[idx / 512]; - - // Add rank of intervening words - for j in (idx / 64) & !7..idx / 64 { - rank += bv.get_word(j).count_ones() as u64; - } - - // Add rank of final word up to hash - let final_word = bv.get_word(idx / 64); - if idx % 64 > 0 { - rank += (final_word << (64 - (idx % 64))).count_ones() as u64; - } - rank - } - /// Compute the hash value of `item`. This method should only be used /// with items known to be in construction set. Use `try_hash` if you cannot /// guarantee that `item` was in the construction set. If `item` was not present @@ -324,7 +674,7 @@ impl Mphf { pub fn hash(&self, item: &T) -> u64 { for i in 0..self.bitvecs.len() { let (bv, _) = &self.bitvecs[i]; - let hash = hashmod(i as u64, item, bv.capacity() as u64); + let hash = hashmod(i as u64, item, bv.capacity()); if bv.contains(hash) { return self.get_rank(hash, i); @@ -340,11 +690,11 @@ impl Mphf { pub fn try_hash(&self, item: &Q) -> Option where T: Borrow, - Q: ?Sized + Hash, + Q: ?Sized + SeedableHash, { for i in 0..self.bitvecs.len() { let (bv, _) = &(self.bitvecs)[i]; - let hash = hashmod(i as u64, item, bv.capacity() as u64); + let hash = hashmod(i as u64, item, bv.capacity()); if bv.contains(hash) { return Some(self.get_rank(hash, i)); @@ -356,7 +706,7 @@ impl Mphf { } #[cfg(feature = "parallel")] -impl Mphf { +impl Mphf { /// Same as `new`, but parallelizes work on the rayon default Rayon threadpool. /// Configure the number of threads on that threadpool to control CPU usage. 
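With the `parallel` feature (on by default), `new_parallel` runs on Rayon's global pool, so sizing that pool is how you bound CPU usage. A sketch, assuming `rayon` is also a direct dependency of the caller; the trailing `None` mirrors how the benchmarks invoke it:

```rust
use boomphf::Mphf;

fn main() {
    // Bound CPU usage by sizing Rayon's global pool before building.
    rayon::ThreadPoolBuilder::new()
        .num_threads(4)
        .build_global()
        .ok();

    let items: Vec<u64> = (0..1_000_000u64).map(|x| x * 2).collect();
    let phf = Mphf::new_parallel(1.7, &items, None);
    assert!(phf.hash(&42u64) < 1_000_000);
}
```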
#[cfg(feature = "parallel")] @@ -418,7 +768,7 @@ struct Context { impl Context { fn new(size: u64, seed: u64) -> Self { Self { - size: size as u64, + size, seed, a: BitVector::new(size), collide: BitVector::new(size), @@ -426,14 +776,14 @@ impl Context { } #[cfg(feature = "parallel")] - fn find_collisions(&self, v: &T) { + fn find_collisions(&self, v: &T) { let idx = hashmod(self.seed, v, self.size); if !self.collide.contains(idx) && !self.a.insert(idx) { self.collide.insert(idx); } } - fn find_collisions_sync(&mut self, v: &T) { + fn find_collisions_sync(&mut self, v: &T) { let idx = hashmod(self.seed, v, self.size); if !self.collide.contains(idx) && !self.a.insert_sync(idx) { self.collide.insert_sync(idx); @@ -441,7 +791,7 @@ impl Context { } #[cfg(feature = "parallel")] - fn filter<'t, T: Hash>(&self, v: &'t T) -> Option<&'t T> { + fn filter<'t, T: SeedableHash>(&self, v: &'t T) -> Option<&'t T> { let idx = hashmod(self.seed, v, self.size); if self.collide.contains(idx) { self.a.remove(idx); @@ -452,7 +802,7 @@ impl Context { } #[cfg(not(feature = "parallel"))] - fn filter<'t, T: Hash>(&mut self, v: &'t T) -> Option<&'t T> { + fn filter<'t, T: SeedableHash>(&mut self, v: &'t T) -> Option<&'t T> { let idx = hashmod(self.seed, v, self.size); if self.collide.contains(idx) { self.a.remove(idx); @@ -533,7 +883,10 @@ where } #[cfg(feature = "parallel")] -impl<'a, T: 'a + Hash + Debug + Send + Sync> Mphf { +impl<'a, T: 'a + SeedableHash + Debug + Send + Sync> Mphf +where + &'a T: SeedableHash, +{ /// Same as to `from_chunked_iterator` but parallelizes work over `num_threads` threads. #[cfg(feature = "parallel")] pub fn from_chunked_iterator_parallel( @@ -569,7 +922,7 @@ impl<'a, T: 'a + Hash + Debug + Send + Sync> Mphf { loop { if max_iters.is_some() && iter > max_iters.unwrap() { error!("ran out of key space. 
items: {:?}", global.done_keys.len()); - panic!("counldn't find unique hashes"); + panic!("couldn't find unique hashes"); } let keys_remaining = if iter == 0 { @@ -701,7 +1054,7 @@ mod tests { /// Check that a Minimal perfect hash function (MPHF) is generated for the set xs fn check_mphf(xs: HashSet) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { let xsv: Vec = xs.into_iter().collect(); @@ -712,7 +1065,7 @@ mod tests { /// Check that a Minimal perfect hash function (MPHF) is generated for the set xs fn check_mphf_serial(xsv: &[T]) -> bool where - T: Hash + PartialEq + Eq + Debug, + T: SeedableHash + PartialEq + Eq + Debug, { // Generate the MPHF let phf = Mphf::new(1.7, xsv); @@ -731,7 +1084,7 @@ mod tests { #[cfg(feature = "parallel")] fn check_mphf_parallel(xsv: &[T]) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { // Generate the MPHF let phf = Mphf::new_parallel(1.7, xsv, None); @@ -749,14 +1102,14 @@ mod tests { #[cfg(not(feature = "parallel"))] fn check_mphf_parallel(_xsv: &[T]) -> bool where - T: Hash + PartialEq + Eq + Debug, + T: SeedableHash + PartialEq + Eq + Debug, { true } fn check_chunked_mphf(values: Vec>, total: u64) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { let phf = Mphf::from_chunked_iterator(1.7, &values, total); @@ -776,7 +1129,7 @@ mod tests { #[cfg(feature = "parallel")] fn check_chunked_mphf_parallel(values: Vec>, total: u64) -> bool where - T: Sync + Hash + PartialEq + Eq + Debug + Send, + T: Sync + SeedableHash + PartialEq + Eq + Debug + Send, { let phf = Mphf::from_chunked_iterator_parallel(1.7, &values, None, total, 2); @@ -883,4 +1236,55 @@ mod tests { let items = (0..1000000).map(|x| x * 2); assert!(check_mphf(HashSet::from_iter(items))); } + + #[test] + fn externally_hashed() { + let total = 1000000; + // User gets to pick the hash function. + let entries = (0..total) + .map(|x| ExternallyHashed(wyhash::wyrng(&mut (x * 2)))) + .collect::>(); + let phf = Mphf::new(1.7, &entries); + + let mut hashes = entries.iter().map(|eh| phf.hash(eh)).collect::>(); + hashes.sort_unstable(); + + let gt = (0..total as u64).collect::>(); + assert_eq!(hashes, gt); + + // Hand-picked a value that fails to hash since it's not in the original set that it's built from. + // It's not ideal that this assertion is sensitive to the implementation details internal to Mphf. + assert_eq!( + phf.try_hash(&ExternallyHashed(wyhash::wyrng(&mut 1000129))), + None + ); + } + + #[test] + fn dma_serde() { + // User gets to pick the hash function. 
+ let items = (0..1000000).map(|x| x * 2).collect::>(); + let phf = Mphf::new(1.7, &items); + + let serialized = phf.to_dma(); + + let phf2 = Mphf::::copy_from_scatter_dma(serialized.iter().copied()); + let phf3 = MphfRef::::from_scatter_dma(serialized.iter().copied()); + + assert_eq!(phf, phf2); + assert_eq!(phf2, phf); + + assert_eq!(phf, phf3); + assert_eq!(phf3, phf); + + for i in items { + assert_eq!(phf.try_hash(&i), phf2.try_hash(&i)); + assert_eq!(phf.try_hash(&i), phf3.try_hash(&i)); + } + + for i in 1000000..1000000 * 2 { + assert_eq!(phf.try_hash(&i), phf2.try_hash(&i)); + assert_eq!(phf.try_hash(&i), phf3.try_hash(&i)); + } + } } diff --git a/src/par_iter.rs b/src/par_iter.rs index 0ced54d..a00c530 100644 --- a/src/par_iter.rs +++ b/src/par_iter.rs @@ -1,12 +1,11 @@ -use std::hash::Hash; - use crate::hashmap::BoomHashMap; +use crate::SeedableHash; use rayon::iter::plumbing::{bridge, Consumer, Producer, ProducerCallback, UnindexedConsumer}; use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; impl<'data, K, V> IntoParallelIterator for &'data BoomHashMap where - K: Hash + Sync + 'data, + K: SeedableHash + Sync + 'data, V: Sync + 'data, { type Item = (&'data K, &'data V);
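The par_iter.rs hunk only swaps the key bound from `Hash` to `SeedableHash`, so parallel iteration over a `BoomHashMap` continues to work for any key type implementing the new trait. A closing sketch, again assuming the existing `BoomHashMap::new(keys, values)` constructor and a direct `rayon` dependency:

```rust
use boomphf::hashmap::BoomHashMap;
use rayon::prelude::*;

fn main() {
    let keys: Vec<u64> = (0..100_000u64).collect();
    let values: Vec<u64> = keys.iter().map(|k| k * 3).collect();
    let map = BoomHashMap::new(keys, values);

    // &BoomHashMap implements IntoParallelIterator, yielding (&K, &V) pairs.
    let sum: u64 = (&map).into_par_iter().map(|(_k, v)| *v).sum();
    let expected: u64 = (0..100_000u64).map(|k| k * 3).sum();
    assert_eq!(sum, expected);
}
```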