Skip to content

Commit

Permalink
Added more hash
Browse files Browse the repository at this point in the history
  • Loading branch information
LucaCappelletti94 committed Aug 24, 2024
1 parent e0775e1 commit bc92d5b
Show file tree
Hide file tree
Showing 18 changed files with 204 additions and 1 deletion.
2 changes: 2 additions & 0 deletions measure_gaps/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ version = "0.1.0"
edition = "2021"

[dependencies]
ahash = "0.8.11"
hyperloglog-rs = {path="../", features=["all_precisions"]}
indicatif = { version = "0.17.8", features = ["rayon"] }
rayon = "1.10.0"
serde = { version = "1.0.208", features = ["derive"] }
test_utils = {path="../test_utils"}
twox-hash = "1.6.3"
wyhash = "0.5.0"
87 changes: 87 additions & 0 deletions measure_gaps/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,90 @@ gap,count
5,123
```

## Some results for XXhasher

| Precision | Bits | Hash | Entropy | Improvement |
|-----------|------|------|-----------|-------------|
| 18 | 5 | 3 | 9.4753 | 2.5329 |
| 18 | 4 | 3 | 9.8542 | 2.4355 |
| 17 | 6 | 3 | 10.1821 | 2.3571 |
| 17 | 5 | 3 | 10.4739 | 2.2914 |
| 17 | 4 | 3 | 10.8532 | 2.2113 |
| 16 | 6 | 3 | 11.1812 | 2.1465 |
| 16 | 5 | 3 | 11.4729 | 2.0919 |
| 16 | 4 | 3 | 11.8525 | 2.0249 |
| 15 | 6 | 3 | 12.1806 | 1.9703 |
| 4 | 4 | 4 | 16.6096 | 1.9266 |
| 15 | 5 | 3 | 12.4724 | 1.9242 |
| 15 | 4 | 3 | 12.8520 | 1.8674 |
| 14 | 6 | 3 | 13.1801 | 1.8209 |
| 18 | 6 | 4 | 17.5954 | 1.8187 |
| 10 | 5 | 2 | 8.8860 | 1.8006 |
| 18 | 5 | 4 | 17.8870 | 1.7890 |
| 14 | 5 | 3 | 13.4718 | 1.7815 |
| 5 | 4 | 4 | 18.1944 | 1.7588 |
| 4 | 6 | 4 | 18.1944 | 1.7588 |
| 4 | 5 | 4 | 18.1944 | 1.7588 |
| 18 | 4 | 4 | 18.2666 | 1.7518 |
| 14 | 4 | 3 | 13.8516 | 1.7326 |
| 10 | 4 | 2 | 9.2638 | 1.7272 |
| 17 | 6 | 4 | 18.5949 | 1.7209 |
| 17 | 5 | 4 | 18.8863 | 1.6944 |
| 13 | 6 | 3 | 14.1793 | 1.6926 |
| 5 | 6 | 4 | 18.9311 | 1.6903 |
| 5 | 5 | 4 | 18.9311 | 1.6903 |
| 9 | 6 | 2 | 9.5894 | 1.6685 |
| 17 | 4 | 4 | 19.2656 | 1.6610 |
| 13 | 5 | 3 | 14.4714 | 1.6584 |
| 6 | 4 | 4 | 19.4162 | 1.6481 |
| 16 | 6 | 4 | 19.5932 | 1.6332 |
| 9 | 5 | 2 | 9.8799 | 1.6194 |
| 6 | 5 | 4 | 19.7781 | 1.6179 |
| 13 | 4 | 3 | 14.8505 | 1.6161 |
| 16 | 5 | 4 | 19.8840 | 1.6093 |
| 6 | 6 | 4 | 20.0669 | 1.5947 |
| 12 | 6 | 3 | 15.1777 | 1.5813 |
| 16 | 4 | 4 | 20.2622 | 1.5793 |
| 7 | 4 | 4 | 20.5130 | 1.5600 |
| 9 | 4 | 2 | 10.2567 | 1.5599 |
| 15 | 6 | 4 | 20.5877 | 1.5543 |
| 12 | 5 | 3 | 15.4693 | 1.5515 |
| 7 | 5 | 4 | 20.8517 | 1.5346 |
| 15 | 5 | 4 | 20.8763 | 1.5328 |
| 4 | 4 | 1 | 5.2244 | 1.5313 |
| 7 | 6 | 4 | 21.1245 | 1.5148 |
| 12 | 4 | 3 | 15.8485 | 1.5143 |
| 8 | 6 | 2 | 10.5805 | 1.5122 |
| 15 | 4 | 4 | 21.2507 | 1.5058 |
| 8 | 4 | 4 | 21.5495 | 1.4850 |
| 11 | 6 | 3 | 16.1730 | 1.4840 |
| 14 | 6 | 4 | 21.5689 | 1.4836 |
| 8 | 5 | 2 | 10.8693 | 1.4720 |
| 14 | 5 | 4 | 21.8505 | 1.4645 |
| 8 | 5 | 4 | 21.8715 | 1.4631 |
| 11 | 5 | 3 | 16.4648 | 1.4577 |
| 8 | 6 | 4 | 22.1295 | 1.4460 |
| 14 | 4 | 4 | 22.2126 | 1.4406 |
| 11 | 4 | 3 | 16.8397 | 1.4252 |
| 8 | 4 | 2 | 11.2439 | 1.4230 |
| 13 | 6 | 4 | 22.5083 | 1.4217 |
| 9 | 4 | 4 | 22.5298 | 1.4203 |
| 13 | 5 | 4 | 22.7678 | 1.4055 |
| 9 | 5 | 4 | 22.8206 | 1.4022 |
| 10 | 6 | 3 | 17.1591 | 1.3987 |
| 9 | 6 | 4 | 23.0445 | 1.3886 |
| 13 | 4 | 4 | 23.0920 | 1.3858 |
| 7 | 6 | 2 | 11.5636 | 1.3836 |
| 10 | 5 | 3 | 17.4459 | 1.3757 |
| 12 | 6 | 4 | 23.3225 | 1.3721 |
| 10 | 4 | 4 | 23.3807 | 1.3687 |
| 12 | 5 | 4 | 23.5191 | 1.3606 |
| 10 | 5 | 4 | 23.8359 | 1.3446 |
| 12 | 4 | 4 | 23.9475 | 1.3383 |
| 11 | 6 | 4 | 24.1145 | 1.3294 |
| 11 | 5 | 4 | 24.3117 | 1.3213 |
| 11 | 4 | 4 | 24.7399 | 1.2998 |
| 10 | 6 | 4 | 24.7674 | 1.2983 |
| 10 | 4 | 3 | 18.2637 | 1.2867 |
| 6 | 5 | 1 | 8.1795 | 1.2838 |
| 10 | 5 | 1 | 7.8889 | 1.2798 |
96 changes: 96 additions & 0 deletions measure_gaps/plot_gaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""Plot histograms of the hash gaps.
This script, for each precision considered in the analysis, i.e. the range 4 to 18,
loads the gzipped CSVs from the reports directory and plots the histograms of the gap counts
for each hash size and each bit size, and hasher used.
The names of the documents are of the format 'reports/gap_report_precision_{}_bits_{}_hash_{}_{}.csv.gz'
where the first placeholder is the precision, the second is the bit size, the third is the hash size,
and the fourth is the hasher used.
The expected layout is a grid with "number of hash sizes" columns and "number of bit sizes" rows.
Different precisions and different hasher are plotted in different figures.
"""

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import trange, tqdm


def plot(df: pd.DataFrame, precision: int, hasher: str):
    """Plot a grid of gap histograms for one precision and hasher, and save it as a PNG.

    The grid has one column per distinct ``hash_size`` and one row per distinct
    ``bit_size`` found in ``df``. Combinations with no rows are drawn as a red "X".
    The figure is written to ``reports/gap_report_precision_{precision}_{hasher}.png``.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing the gap counts. It is expected to have the
        columns ``gap``, ``hash_size`` and ``bit_size``; when a ``count`` column
        is present it is used to weight each gap by its frequency.
    precision : int
        The precision used in the analysis.
    hasher : str
        The hasher used in the analysis.
    """
    hash_sizes = df["hash_size"].unique()
    bit_sizes = df["bit_size"].unique()

    # squeeze=False keeps `axes` two-dimensional even when only one hash size
    # or one bit size is available, so `axes[row, column]` is always valid.
    fig, axes = plt.subplots(
        len(bit_sizes),
        len(hash_sizes),
        figsize=(5 * len(hash_sizes), 5 * len(bit_sizes)),
        squeeze=False,
    )

    # The reports store one row per distinct gap together with how many times
    # it was observed, so the histogram must be weighted by that count;
    # otherwise every distinct gap would contribute exactly one observation.
    weights = "count" if "count" in df.columns else None

    for column, hash_size in enumerate(hash_sizes):
        for row, bit_size in enumerate(bit_sizes):
            data = df[(df["hash_size"] == hash_size) & (df["bit_size"] == bit_size)]
            ax = axes[row, column]

            if data.empty:
                # If the data is empty, we draw a red X in the plot
                ax.text(0.5, 0.5, "X", fontsize=24, color="red", ha="center", va="center")
                ax.axis("off")
                continue

            ax.set_title(f"Hash size: {hash_size}, Bit size: {bit_size}")
            sns.histplot(
                data,
                x="gap",
                weights=weights,
                ax=ax,
                bins=500,
            )

    fig.suptitle(f"Precision: {precision}, Hasher: {hasher}")
    fig.tight_layout()
    fig.savefig(f"reports/gap_report_precision_{precision}_{hasher}.png")

    # Release the figure: this function is called once per precision and
    # matplotlib keeps every figure alive until it is explicitly closed.
    plt.close(fig)


def plot_all(hasher: str = "xxhash64"):
    """Plot the gap histograms for every precision from 4 to 18.

    For each precision, the available reports for hash sizes 1 to 4 and bit
    sizes 4 to 6 are loaded, tagged with their hash and bit size, concatenated
    and handed to ``plot``. Report files that do not exist are skipped, and a
    precision for which no report exists at all produces no figure.

    Parameters
    ----------
    hasher : str
        The hasher name used as the last component of the report file names.
        Defaults to ``"xxhash64"``, the value previously hard-coded here.
    """
    sns.set_style("whitegrid")
    for precision in trange(4, 19, desc="Precision"):
        dataframes = []
        for hash_size in tqdm([1, 2, 3, 4], desc="Hash size", leave=False):
            for bit_size in tqdm([4, 5, 6], desc="Bit size", leave=False):
                path = f"reports/gap_report_precision_{precision}_bits_{bit_size}_hash_{hash_size}_{hasher}.csv.gz"
                if not os.path.exists(path):
                    continue

                df = pd.read_csv(
                    path,
                    compression="gzip",
                    dtype={"gap": "int32", "count": "int64"},
                )
                df["hash_size"] = hash_size
                df["bit_size"] = bit_size
                dataframes.append(df)

        if not dataframes:
            # pd.concat raises ValueError on an empty list, which would abort
            # the whole run as soon as one precision has no reports.
            continue

        # ignore_index avoids carrying duplicated row labels from the
        # individual reports into the combined frame.
        plot(pd.concat(dataframes, ignore_index=True), precision, hasher)


# Generate every figure only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    plot_all()
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 19 additions & 1 deletion measure_gaps/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ use serde::Serialize;
use std::collections::HashMap;
use test_utils::prelude::write_csv;
use twox_hash::XxHash64;
use wyhash::WyHash;
use ahash::AHasher;

#[derive(Serialize, Deserialize)]
struct GapReport {
Expand All @@ -20,6 +22,8 @@ struct GapReport {
count: u64,
}



fn measure_gaps<P: Precision, B: Bits, H: HasherType>(multiprogress: &MultiProgress)
where
P: ArrayRegister<B>,
Expand Down Expand Up @@ -127,9 +131,23 @@ where
/// bit size and hasher types.
macro_rules! generate_measure_gaps {
    ($multiprogress:ident, $precision:ty, $bit_size:ty, $($hasher:ty),*) => {
        // The bar advances once per hasher, so its length must be the number
        // of hasher types actually passed to the macro rather than a fixed
        // constant. The count is taken from a slice of the stringified types;
        // it is a small compile-time-known value, so widening to u64 is lossless.
        let number_of_hashers = <[&str]>::len(&[$(stringify!($hasher)),*]) as u64;

        let progress_bar = $multiprogress.add(ProgressBar::new(number_of_hashers));

        progress_bar.set_style(
            ProgressStyle::default_bar()
                .template("[{elapsed_precise} | {eta}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}")
                .unwrap()
                .progress_chars("##-"),
        );

        // Draw the bar immediately, before the first measurement completes.
        progress_bar.tick();

        // Run the measurement once per hasher type, advancing the bar each time.
        $(
            measure_gaps::<$precision, $bit_size, $hasher>($multiprogress);
            progress_bar.inc(1);
        )*

        progress_bar.finish();
    };
}

Expand All @@ -149,7 +167,7 @@ macro_rules! generate_measure_gaps_for_precision {
progress_bar.tick();

$(
generate_measure_gaps!($multiprogress, $precision, $bit_size, XxHash64);
generate_measure_gaps!($multiprogress, $precision, $bit_size, WyHash, AHasher);
progress_bar.inc(1);
)*

Expand Down

0 comments on commit bc92d5b

Please sign in to comment.