From 713b18709dfbe37c0b62f83fa7ca834aa0527fd6 Mon Sep 17 00:00:00 2001
From: Marcus Holland-Moritz
Date: Mon, 5 Apr 2021 00:55:51 +0200
Subject: [PATCH] Use XXH3-128 for duplicate detection

---
Note: this patch was recovered from a whitespace-collapsed copy in which all
angle-bracketed text had been stripped. Template arguments were reconstructed;
verify the context lines `std::optional<uint32_t> inode_num;` and
`std::shared_ptr<mmif> const& mm` against the tree before applying.

 include/dwarfs/checksum.h |  3 +++
 include/dwarfs/entry.h    |  2 +-
 src/dwarfs/checksum.cpp   | 40 +++++++++++++++++++++++++++++++++++++++
 src/dwarfs/entry.cpp      |  2 +-
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/include/dwarfs/checksum.h b/include/dwarfs/checksum.h
index 453f95adc..cc646c41f 100644
--- a/include/dwarfs/checksum.h
+++ b/include/dwarfs/checksum.h
@@ -35,6 +35,7 @@ class checksum {
     SHA1,
     SHA2_512_256,
     XXH3_64,
+    XXH3_128,
   };
 
   static constexpr size_t digest_size(algorithm alg) {
@@ -45,6 +46,8 @@ class checksum {
       return 32;
     case algorithm::XXH3_64:
       return 8;
+    case algorithm::XXH3_128:
+      return 16;
     }
     DWARFS_CHECK(false, "unknown algorithm");
   }
diff --git a/include/dwarfs/entry.h b/include/dwarfs/entry.h
index 8a38de929..a31faa661 100644
--- a/include/dwarfs/entry.h
+++ b/include/dwarfs/entry.h
@@ -137,7 +137,7 @@ class file : public entry {
 
  private:
   struct data {
-    using hash_type = std::array<char, 20>;
+    using hash_type = std::array<char, 16>;
     hash_type hash{0};
     uint32_t refcount{1};
     std::optional<uint32_t> inode_num;
diff --git a/src/dwarfs/checksum.cpp b/src/dwarfs/checksum.cpp
index 822572abd..39e4deda3 100644
--- a/src/dwarfs/checksum.cpp
+++ b/src/dwarfs/checksum.cpp
@@ -49,6 +49,14 @@ bool compute_xxh3_64(void const* data, size_t size, void* digest) {
   return true;
 }
 
+bool compute_xxh3_128(void const* data, size_t size, void* digest) {
+  auto hash = XXH3_128bits(data, size);
+  static_assert(checksum::digest_size(checksum::algorithm::XXH3_128) ==
+                sizeof(hash));
+  ::memcpy(digest, &hash, sizeof(hash));
+  return true;
+}
+
 class checksum_evp : public checksum::impl {
  public:
   checksum_evp(EVP_MD const* evp, checksum::algorithm alg)
@@ -109,6 +117,32 @@ class checksum_xxh3_64 : public checksum::impl {
   XXH3_state_t* state_;
 };
 
+class checksum_xxh3_128 : public checksum::impl {
+ public:
+  checksum_xxh3_128()
+      : state_(XXH3_createState()) {
+    DWARFS_CHECK(XXH3_128bits_reset(state_) == XXH_OK,
+                 "XXH3_128bits_reset() failed");
+  }
+
+  ~checksum_xxh3_128() override { XXH3_freeState(state_); }
+
+  void update(void const* data, size_t size) override {
+    auto err = XXH3_128bits_update(state_, data, size);
+    DWARFS_CHECK(err == XXH_OK, fmt::format("XXH3_128bits_update() failed: {}",
+                                            static_cast<int>(err)));
+  }
+
+  bool finalize(void* digest) override {
+    auto hash = XXH3_128bits_digest(state_);
+    ::memcpy(digest, &hash, sizeof(hash));
+    return true;
+  }
+
+ private:
+  XXH3_state_t* state_;
+};
+
 } // namespace
 
 bool checksum::compute(algorithm alg, void const* data, size_t size,
@@ -126,6 +160,9 @@ bool checksum::compute(algorithm alg, void const* data, size_t size,
   case algorithm::XXH3_64:
     rv = compute_xxh3_64(data, size, digest);
     break;
+  case algorithm::XXH3_128:
+    rv = compute_xxh3_128(data, size, digest);
+    break;
   }
 
   if (rv && dig_size > 0) {
@@ -157,6 +194,9 @@ checksum::checksum(algorithm alg)
   case algorithm::XXH3_64:
     impl_ = std::make_unique<checksum_xxh3_64>();
     break;
+  case algorithm::XXH3_128:
+    impl_ = std::make_unique<checksum_xxh3_128>();
+    break;
   default:
     DWARFS_CHECK(false, "unknown algorithm");
     break;
diff --git a/src/dwarfs/entry.cpp b/src/dwarfs/entry.cpp
index 6988f519c..2f42411e4 100644
--- a/src/dwarfs/entry.cpp
+++ b/src/dwarfs/entry.cpp
@@ -168,7 +168,7 @@ void file::scan(os_access& os, progress& prog) {
 }
 
 void file::scan(std::shared_ptr<mmif> const& mm, progress& prog) {
-  constexpr auto alg = checksum::algorithm::SHA1;
+  constexpr auto alg = checksum::algorithm::XXH3_128;
   static_assert(checksum::digest_size(alg) == sizeof(data::hash_type));
 
   if (size_t s = size(); s > 0) {