Skip to content

Commit

Permalink
Merge pull request #397 from knowsys/dict-exp
Browse files Browse the repository at this point in the history
use HashMapDictionary
  • Loading branch information
larry-gonzalez authored Nov 6, 2023
2 parents 1f6232c + c5f805d commit 7b544a1
Show file tree
Hide file tree
Showing 23 changed files with 1,900 additions and 182 deletions.
57 changes: 57 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion nemo-benches/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,21 @@ license.workspace = true
readme = "README.md"
repository.workspace = true

[[bin]]
name = "dict-bench"
path = "src/bin/dict-bench.rs"

[dependencies]
nemo-physical = { path = "../nemo-physical", default-features = false }
nemo = { path = "../nemo", default-features = false }
rand = "0.8.5"
flate2 = "1"
log = { version = "0.4", features = [ "max_level_trace", "release_max_level_trace" ] }
clap = { version = "4.0.32", features = [ "derive", "cargo", "env" ] }
colored = "2"
env_logger = "*"

[dev-dependencies]
env_logger = "*"
criterion = { version = "0.5", features = [ "html_reports" ] }
rand_pcg = "0.3"

Expand Down
8 changes: 4 additions & 4 deletions nemo-benches/benches/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use nemo_physical::{
builder_proxy::{
ColumnBuilderProxy, PhysicalBuilderProxyEnum, PhysicalStringColumnBuilderProxy,
},
dictionary::PrefixedStringDictionary,
dictionary::HashMapDictionary,
};
use rand::{distributions::Alphanumeric, prelude::*};
use rand_pcg::Pcg64;
Expand Down Expand Up @@ -35,7 +35,7 @@ pub fn benchmark_input(c: &mut Criterion) {
group.bench_function("read_strings", |b| {
b.iter_batched(
|| {
let dict = std::cell::RefCell::new(PrefixedStringDictionary::default());
let dict = std::cell::RefCell::new(HashMapDictionary::default());
(strings.clone(), dict)
},
|(input, dict)| {
Expand All @@ -53,7 +53,7 @@ pub fn benchmark_input(c: &mut Criterion) {
group.bench_function("read_terms", |b| {
b.iter_batched(
|| {
let dict = std::cell::RefCell::new(PrefixedStringDictionary::default());
let dict = std::cell::RefCell::new(HashMapDictionary::default());
(terms.clone(), dict)
},
|(input, dict)| {
Expand All @@ -71,7 +71,7 @@ pub fn benchmark_input(c: &mut Criterion) {
group.bench_function("read_iris", |b| {
b.iter_batched(
|| {
let dict = std::cell::RefCell::new(PrefixedStringDictionary::default());
let dict = std::cell::RefCell::new(HashMapDictionary::default());
(iris.clone(), dict)
},
|(input, dict)| {
Expand Down
119 changes: 119 additions & 0 deletions nemo-benches/src/bin/dict-bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use flate2::read::MultiGzDecoder;
use std::env;
use std::fs::File;
use std::io::prelude::*;
use std::io::stdin;
use std::io::BufReader;

use nemo::meta::{timing::TimedDisplay, TimedCode};
use nemo_physical::dictionary::{
hash_map_dictionary::HashMapDictionary, meta_dictionary::MetaDictionary,
prefixed_string_dictionary::PrefixedStringDictionary, string_dictionary::StringDictionary,
AddResult, Dictionary,
};

/// Builds the dictionary implementation selected by `dict_type`.
///
/// Recognised identifiers are `"hash"`, `"hashmap"`, `"prefix"` and `"meta"`.
/// The chosen implementation is announced on stdout before it is constructed.
///
/// # Panics
///
/// Panics if `dict_type` is not one of the identifiers listed above.
fn create_dictionary(dict_type: &str) -> Box<dyn Dictionary> {
    if dict_type == "hash" {
        println!("Using StringDictionary.");
        return Box::new(StringDictionary::new());
    }

    if dict_type == "hashmap" {
        println!("Using HashMapDictionary.");
        return Box::new(HashMapDictionary::new());
    }

    if dict_type == "prefix" {
        println!("Using PrefixedStringDictionary.");
        return Box::new(PrefixedStringDictionary::new());
    }

    if dict_type == "meta" {
        println!("Using MetaDictionary.");
        return Box::new(MetaDictionary::new());
    }

    // None of the known identifiers matched.
    panic!("Unexpected dictionary type '{}'.", dict_type)
}

/// Entry point of the dictionary benchmark.
///
/// Usage: `dict-bench <filename> <dicttype> <nonstop>`
///
/// Reads the gzip-compressed file `<filename>` line by line, adds every line to a
/// dictionary of the kind selected by `<dicttype>`, and prints the number of processed
/// lines, the number of fresh (unique) entries, and the total byte length of the fresh
/// entries, followed by the timing tree collected via `TimedCode`.
///
/// If fewer than two arguments are supplied, the usage text is printed and the process
/// exits with status 1. Unless a third argument is given, the program waits for a line
/// on stdin before terminating, so the filled dictionary stays alive for inspection.
///
/// # Panics
///
/// Panics if the file cannot be opened, if a line cannot be read from it, or if
/// `<dicttype>` is not a known dictionary identifier.
fn main() {
    env_logger::init();
    TimedCode::instance().start();

    let args: Vec<_> = env::args().collect();
    if args.len() < 3 {
        println!("Usage: dict-bench <filename> <dicttype> <nonstop>");
        println!(
            "  <filename> File with dictionary entries, one per line, possibly with duplicates."
        );
        println!(
            "  <dicttype> Identifier for the dictionary to test, e.g., \"hash\" or \"prefix\"."
        );
        println!(
            "  <nonstop> If anything is given here, the program will terminate without asking for a prompt."
        );
        // Both mandatory arguments are required; stop here instead of falling through
        // and panicking on the out-of-bounds `args[1]` / `args[2]` accesses below.
        std::process::exit(1);
    }

    let filename = &args[1];
    let dicttype = &args[2];

    // The input is expected to be gzip-compressed (possibly multi-member).
    let reader = BufReader::new(MultiGzDecoder::new(
        File::open(filename).expect("Cannot open file."),
    ));

    let mut dict = create_dictionary(dicttype);
    let mut count_lines = 0;
    let mut count_unique = 0;
    let mut bytes = 0;

    TimedCode::instance().sub("Dictionary filling").start();

    println!("Starting to fill dictionary ...");

    for l in reader.lines() {
        let s = l.expect("Cannot read line from file.");
        // Remember the length before the string is moved into the dictionary.
        let b = s.len();

        let entry_status = dict.add_string(s);
        match entry_status {
            // Only strings seen for the first time count towards the unique statistics.
            AddResult::Fresh(_value) => {
                bytes += b;
                count_unique += 1;
            }
            AddResult::Known(_value) => {}
            AddResult::Rejected => {}
        }

        count_lines += 1;
    }

    TimedCode::instance().sub("Dictionary filling").stop();

    println!(
        "Processed {} strings (dictionary contains {} unique strings with {} bytes overall).",
        count_lines, count_unique, bytes
    );

    TimedCode::instance().stop();

    println!(
        "\n{}",
        TimedCode::instance().create_tree_string(
            "dict-bench",
            &[
                TimedDisplay::default(),
                TimedDisplay::default(),
                TimedDisplay::new(nemo::meta::timing::TimedSorting::LongestThreadTime, 0)
            ]
        )
    );

    if args.len() < 4 {
        println!("All done. Press return to end benchmark (and free all memory).");
        let mut s = String::new();
        stdin().read_line(&mut s).expect("No string entered?");
    }

    if dict.len() == 123456789 {
        // FWIW, prevent dict from going out of scope before really finishing
        println!("Today is your lucky day.");
    }
}
3 changes: 0 additions & 3 deletions nemo-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,6 @@ license.workspace = true
readme = "README.md"
repository.workspace = true

[features]
no-prefixed-string-dictionary = ["nemo/no-prefixed-string-dictionary"]

[[bin]]
name = "nmo"
path = "src/main.rs"
Expand Down
3 changes: 2 additions & 1 deletion nemo-physical/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ default = ["timing"]
# Enables time measurements using the "howlong" crate
# If this feature is not enabled, all time measurements will display zero instead
timing = ["dep:howlong"]
no-prefixed-string-dictionary = []

[dependencies]
log = "0.4"
Expand All @@ -24,10 +23,12 @@ num = "0.4.0"
ascii_tree = "0.1.1"
once_cell = "1"
linked-hash-map = "0.5.6"
lru = "0.11.1"
howlong = { version = "0.1", optional = true }
rio_turtle = "0.8.4"
rio_xml = "0.8.4"
reqwest = "0.11.18"
regex = "1.9.5"

[dev-dependencies]
arbitrary = { version = "1", features = ["derive"] }
Expand Down
2 changes: 1 addition & 1 deletion nemo-physical/src/arithmetic/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ impl CheckedPow for usize {

impl CheckedPow for u8 {
fn checked_pow(self, exponent: Self) -> Option<Self> {
num::checked_pow(self, exponent.try_into().ok()?)
num::checked_pow(self, exponent.into())
}
}

Expand Down
8 changes: 7 additions & 1 deletion nemo-physical/src/builder_proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,13 @@ impl ColumnBuilderProxy<PhysicalString> for PhysicalStringColumnBuilderProxy<'_>
generic_trait_impl_without_add!(VecT::U64);
fn add(&mut self, input: PhysicalString) -> Result<(), ReadingError> {
self.commit();
self.value = Some(self.dict.borrow_mut().add(input.into()).try_into()?);
self.value = Some(
self.dict
.borrow_mut()
.add_string(input.into())
.value()
.try_into()?,
);
Ok(())
}
}
Expand Down
9 changes: 7 additions & 2 deletions nemo-physical/src/datatypes/data_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ impl DataValueT {
match self {
Self::String(val) => {
// dictionary indices
StorageValueT::U64(dict.add(val.clone().into()).try_into().unwrap())
StorageValueT::U64(
dict.add_string(val.clone().into())
.value()
.try_into()
.unwrap(),
)
}
Self::U32(val) => StorageValueT::U32(*val),
Self::U64(val) => StorageValueT::U64(*val),
Expand All @@ -88,7 +93,7 @@ impl DataValueT {
match self {
Self::String(val) => Some(StorageValueT::U64(
// dictionary indices
dict.index_of(val.into())?.try_into().unwrap(),
dict.fetch_id(val.into())?.try_into().unwrap(),
)),
Self::U32(val) => Some(StorageValueT::U32(*val)),
Self::U64(val) => Some(StorageValueT::U64(*val)),
Expand Down
Loading

0 comments on commit 7b544a1

Please sign in to comment.