Skip to content

Commit

Permalink
Use XXH3-128 for duplicate detection
Browse files Browse the repository at this point in the history
  • Loading branch information
mhx committed Apr 4, 2021
1 parent cbbd48b commit 713b187
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 2 deletions.
3 changes: 3 additions & 0 deletions include/dwarfs/checksum.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class checksum {
SHA1,
SHA2_512_256,
XXH3_64,
XXH3_128,
};

static constexpr size_t digest_size(algorithm alg) {
Expand All @@ -45,6 +46,8 @@ class checksum {
return 32;
case algorithm::XXH3_64:
return 8;
case algorithm::XXH3_128:
return 16;
}
DWARFS_CHECK(false, "unknown algorithm");
}
Expand Down
2 changes: 1 addition & 1 deletion include/dwarfs/entry.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ class file : public entry {

private:
struct data {
using hash_type = std::array<char, 20>;
using hash_type = std::array<char, 16>;
hash_type hash{0};
uint32_t refcount{1};
std::optional<uint32_t> inode_num;
Expand Down
40 changes: 40 additions & 0 deletions src/dwarfs/checksum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@ bool compute_xxh3_64(void const* data, size_t size, void* digest) {
return true;
}

/**
 * One-shot XXH3-128 hash of a contiguous buffer.
 *
 * \param data    start of the input buffer
 * \param size    number of input bytes to hash
 * \param digest  output buffer; receives sizeof(XXH3_128bits result) bytes,
 *                which the static_assert below pins to
 *                checksum::digest_size(checksum::algorithm::XXH3_128)
 * \return always true
 */
bool compute_xxh3_128(void const* data, size_t size, void* digest) {
  auto const result = XXH3_128bits(data, size);

  // Compile-time guard: the advertised digest size must match the size of
  // the value we are about to copy out.
  static_assert(sizeof(result) ==
                checksum::digest_size(checksum::algorithm::XXH3_128));

  // The hash value is copied out in its in-memory representation.
  ::memcpy(digest, &result, sizeof(result));

  return true;
}

class checksum_evp : public checksum::impl {
public:
checksum_evp(EVP_MD const* evp, checksum::algorithm alg)
Expand Down Expand Up @@ -109,6 +117,32 @@ class checksum_xxh3_64 : public checksum::impl {
XXH3_state_t* state_;
};

class checksum_xxh3_128 : public checksum::impl {
public:
checksum_xxh3_128()
: state_(XXH3_createState()) {
DWARFS_CHECK(XXH3_128bits_reset(state_) == XXH_OK,
"XXH3_128bits_reset() failed");
}

~checksum_xxh3_128() override { XXH3_freeState(state_); }

void update(void const* data, size_t size) override {
auto err = XXH3_128bits_update(state_, data, size);
DWARFS_CHECK(err == XXH_OK, fmt::format("XXH3_128bits_update() failed: {}",
static_cast<int>(err)));
}

bool finalize(void* digest) override {
auto hash = XXH3_128bits_digest(state_);
::memcpy(digest, &hash, sizeof(hash));
return true;
}

private:
XXH3_state_t* state_;
};

} // namespace

bool checksum::compute(algorithm alg, void const* data, size_t size,
Expand All @@ -126,6 +160,9 @@ bool checksum::compute(algorithm alg, void const* data, size_t size,
case algorithm::XXH3_64:
rv = compute_xxh3_64(data, size, digest);
break;
case algorithm::XXH3_128:
rv = compute_xxh3_128(data, size, digest);
break;
}

if (rv && dig_size > 0) {
Expand Down Expand Up @@ -157,6 +194,9 @@ checksum::checksum(algorithm alg)
case algorithm::XXH3_64:
impl_ = std::make_unique<checksum_xxh3_64>();
break;
case algorithm::XXH3_128:
impl_ = std::make_unique<checksum_xxh3_128>();
break;
default:
DWARFS_CHECK(false, "unknown algorithm");
break;
Expand Down
2 changes: 1 addition & 1 deletion src/dwarfs/entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ void file::scan(os_access& os, progress& prog) {
}

void file::scan(std::shared_ptr<mmif> const& mm, progress& prog) {
constexpr auto alg = checksum::algorithm::SHA1;
constexpr auto alg = checksum::algorithm::XXH3_128;
static_assert(checksum::digest_size(alg) == sizeof(data::hash_type));

if (size_t s = size(); s > 0) {
Expand Down

0 comments on commit 713b187

Please sign in to comment.