From 49fe0b0b277cafdcad6b9365c6f1f3a86d279296 Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Sun, 11 Feb 2024 23:55:45 -0800 Subject: [PATCH 1/2] improve hash distribution with short strings --- src/dyad/core/dyad_core.c | 27 +++++++++-- src/dyad/utils/CMakeLists.txt | 8 ++++ src/dyad/utils/test_murmur3.c | 84 +++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 src/dyad/utils/test_murmur3.c diff --git a/src/dyad/core/dyad_core.c b/src/dyad/core/dyad_core.c index 5c235ac9..71c4ba78 100644 --- a/src/dyad/core/dyad_core.c +++ b/src/dyad/core/dyad_core.c @@ -48,17 +48,38 @@ static int gen_path_key (const char* restrict str, uint32_t hash[4] = {0u}; // Output for the hash size_t cx = 0ul; int n = 0; + size_t str_len = strlen (str); - if (str == NULL || path_key == NULL || len == 0ul) { + if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) { DYAD_C_FUNCTION_END(); return -1; } path_key[0] = '\0'; + const char* str_long = str; + +#if 1 + // Strings shorter than 128 bytes collide. Especially, the hash value seems + // to depend on the length of such a string. + // For such a short string, we concatenate it as many times as needed to make + // it longer than 128 bytes. + if (str_len < 128ul) { + char buf[PATH_MAX+1] = {'\0'}; + memcpy (buf, str, str_len); + char* str_pos = buf + str_len; + const char* const str_min = buf + 128ul; + while (str_pos < str_min) { + memcpy (str_pos, str, str_len); + str_pos += str_len; + }; + str_len = str_pos - buf; + str_long = buf; + } +#endif + for (uint32_t d = 0u; d < depth; d++) { seed += seeds[d % 10]; - // TODO add assert that str is not NULL - MurmurHash3_x64_128 (str, strlen (str), seed, hash); + MurmurHash3_x64_128 (str_long, str_len, seed, hash); uint32_t bin = (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) % width; n = snprintf (path_key + cx, len - cx, "%x.", bin); cx += n; diff --git a/src/dyad/utils/CMakeLists.txt b/src/dyad/utils/CMakeLists.txt index 4484ddf4..f3f958ee 100644 --- a/src/dyad/utils/CMakeLists.txt +++ b/src/dyad/utils/CMakeLists.txt @@ -39,6 +39,11 @@ add_executable(test_cmp_canonical_path_prefix test_cmp_canonical_path_prefix.c ${CMAKE_CURRENT_SOURCE_DIR}/../common/dyad_structures.h) target_compile_definitions(test_cmp_canonical_path_prefix PUBLIC DYAD_HAS_CONFIG) target_link_libraries(test_cmp_canonical_path_prefix PUBLIC ${PROJECT_NAME}_utils) + +add_executable(test_murmur3 test_murmur3.c) +target_compile_definitions(test_murmur3 PUBLIC DYAD_HAS_CONFIG) +target_link_libraries(test_murmur3 PUBLIC ${PROJECT_NAME}_murmur3) + if(DYAD_LOGGER STREQUAL "CPP_LOGGER") target_link_libraries(test_cmp_canonical_path_prefix PRIVATE ${CPP_LOGGER_LIBRARIES}) endif() @@ -49,6 +54,9 @@ endif() if (TARGET DYAD_C_FLAGS_werror) target_link_libraries(${PROJECT_NAME}_utils PRIVATE DYAD_C_FLAGS_werror) + target_link_libraries(${PROJECT_NAME}_murmur3 PRIVATE DYAD_C_FLAGS_werror) + target_link_libraries(test_murmur3 PRIVATE DYAD_C_FLAGS_werror) + target_link_libraries(test_cmp_canonical_path_prefix PRIVATE DYAD_C_FLAGS_werror) endif () install( diff --git a/src/dyad/utils/test_murmur3.c b/src/dyad/utils/test_murmur3.c new file mode 100644 index 00000000..b720fb49 --- /dev/null +++ b/src/dyad/utils/test_murmur3.c @@ -0,0 +1,84 @@ +#include "dyad/utils/murmur3.h" +#include +#include +#include +#include + +static int gen_path_key (const char* restrict str, + char* restrict path_key, + const size_t len, + const uint32_t depth, + const uint32_t width) +{ + static const uint32_t seeds[10] = + {104677u, 104681u, 104683u, 104693u, 104701u, 104707u, 104711u, 104717u, 104723u, 104729u}; + + uint32_t seed = 57u; + uint32_t hash[4] = {0u}; // Output for the hash + size_t cx = 0ul; + int n = 0; + size_t str_len = strlen (str); + + if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) { + return -1; + } + path_key[0] = '\0'; + + const char* str_long = str; + +#if 1 + // Strings shorter than 128 bytes collide. Especially, the hash value seems + // to depend on the length of such a string. + // For such a short string, we concatenate it as many times as needed to make + // it longer than 128 bytes. + if (str_len < 128ul) { + char buf[PATH_MAX+1] = {'\0'}; + memcpy (buf, str, str_len); + char* str_pos = buf + str_len; + const char* const str_min = buf + 128ul; + while (str_pos < str_min) { + memcpy (str_pos, str, str_len); + str_pos += str_len; + }; + str_len = str_pos - buf; + str_long = buf; + } +#endif + + for (uint32_t d = 0u; d < depth; d++) { + seed += seeds[d % 10]; + MurmurHash3_x64_128 (str_long, str_len, seed, hash); + uint32_t bin = (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) % width; + n = snprintf (path_key + cx, len - cx, "%x.", bin); + //n = snprintf (path_key + cx, len - cx, "%x%x%x%x.", hash[0], hash[1], hash[2], hash[3]); + cx += n; + if (cx >= len || n < 0) { + return -1; + } + } + n = snprintf (path_key + cx, len - cx, "%s", str); + if (cx + n >= len || n < 0) { + return -1; + } + + return 0; +} + + +int main (int argc, char** argv) +{ + if (argc < 4) { + printf ("Usage: %s depth width str1 [str2 [str3 ...]]\n", argv[0]); + return EXIT_FAILURE; + } + + int depth = atoi (argv[1]); + int width = atoi (argv[2]); + for (int i = 3; i < argc; i++) { + char path_key [PATH_MAX + 1] = {'\0'}; + gen_path_key (argv[i], path_key, PATH_MAX, depth, width); + printf("%s\t%s\n", argv[i], path_key); + } + + return EXIT_SUCCESS; +} From 8f49ec3cf133e96bf894b5ef4930e62a7c20e94b Mon Sep 17 00:00:00 2001 From: Jae-Seung Yeom Date: Wed, 14 Feb 2024 02:50:35 -0800 Subject: [PATCH 2/2] redo the minimum hashable string length --- src/dyad/core/dyad_core.c | 22 +++++------------- src/dyad/utils/test_murmur3.c | 20 +++++----------- src/dyad/utils/utils.c | 43 +++++++++++++++++++++++++---------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/dyad/core/dyad_core.c b/src/dyad/core/dyad_core.c index 71c4ba78..f7d706fe 100644 --- a/src/dyad/core/dyad_core.c +++ b/src/dyad/core/dyad_core.c @@ -49,6 +49,7 @@ static int gen_path_key (const char* restrict str, size_t cx = 0ul; int n = 0; size_t str_len = strlen (str); + const char* str_long = str; if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) { DYAD_C_FUNCTION_END(); @@ -56,26 +57,15 @@ static int gen_path_key (const char* restrict str, } path_key[0] = '\0'; - const char* str_long = str; - -#if 1 - // Strings shorter than 128 bytes collide. Especially, the hash value seems - // to depend on the length of such a string. - // For such a short string, we concatenate it as many times as needed to make - // it longer than 128 bytes. + // Just append the string so that it can be as large as 128 bytes. if (str_len < 128ul) { - char buf[PATH_MAX+1] = {'\0'}; + char buf[256] = {'\0'}; memcpy (buf, str, str_len); - char* str_pos = buf + str_len; - const char* const str_min = buf + 128ul; - while (str_pos < str_min) { - memcpy (str_pos, str, str_len); - str_pos += str_len; - }; - str_len = str_pos - buf; + memset (buf + str_len, '@', 128ul - str_len); + buf[128u] = '\0'; + str_len = 128ul; str_long = buf; } -#endif for (uint32_t d = 0u; d < depth; d++) { seed += seeds[d % 10]; diff --git a/src/dyad/utils/test_murmur3.c b/src/dyad/utils/test_murmur3.c index b720fb49..9bb64f78 100644 --- a/src/dyad/utils/test_murmur3.c +++ b/src/dyad/utils/test_murmur3.c @@ -18,29 +18,21 @@ static int gen_path_key (const char* restrict str, size_t cx = 0ul; int n = 0; size_t str_len = strlen (str); + const char* str_long = str; if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) { return -1; } path_key[0] = '\0'; - const char* str_long = str; - #if 1 - // Strings shorter than 128 bytes collide. Especially, the hash value seems - // to depend on the length of such a string. - // For such a short string, we concatenate it as many times as needed to make - // it longer than 128 bytes. + // Just append the string so that it can be as large as 128 bytes. if (str_len < 128ul) { - char buf[PATH_MAX+1] = {'\0'}; + char buf[256] = {'\0'}; memcpy (buf, str, str_len); - char* str_pos = buf + str_len; - const char* const str_min = buf + 128ul; - while (str_pos < str_min) { - memcpy (str_pos, str, str_len); - str_pos += str_len; - }; - str_len = str_pos - buf; + memset (buf + str_len, '@', 128ul - str_len); + buf[128u] = '\0'; + str_len = 128ul; str_long = buf; } #endif diff --git a/src/dyad/utils/utils.c b/src/dyad/utils/utils.c index 4ef940fb..03184052 100644 --- a/src/dyad/utils/utils.c +++ b/src/dyad/utils/utils.c @@ -64,32 +64,51 @@ uint32_t hash_str (const char* str, const uint32_t seed) { if (!str) return 0u; - const size_t len = strlen (str); - if (len == 0ul) return 0u; + const char* str_long = str; + size_t str_len = strlen (str); + if (str_len == 0ul) return 0u; + + // Just append the string so that it can be as large as 128 bytes. + if (str_len < 128ul) { + char buf[256] = {'\0'}; + memcpy (buf, str, str_len); + memset (buf + str_len, '@', 128ul - str_len); + buf[128u] = '\0'; + str_len = 128ul; + str_long = buf; + } uint32_t hash[4] = {0u}; // Output for the hash - MurmurHash3_x64_128 (str, strlen (str), seed, hash); + MurmurHash3_x64_128 (str_long, str_len, seed, hash); return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1; } /** If hashing is not possible, returns 0. Otherwise, returns a non-zero hash value. - * This does not check if the length of string is correct, but simply use it */ + * This only hashes the prefix of a given length */ uint32_t hash_path_prefix (const char* str, const uint32_t seed, const size_t len) { - char strbuf [PATH_MAX+1] = {'\0'}; - uint32_t hash[4] = {0u}; // Output for the first hash with len1 - if (!str || len == 0ul) { return 0u; } + const char* str_long = str; + size_t str_len = strlen (str); - memcpy (strbuf, str, (len > PATH_MAX)? PATH_MAX : len); - const size_t buf_len = strlen (strbuf); - if (buf_len != len) { - return 0u; + if (str_len < len) return 0u; + str_len = len; + + // Just append the string so that it can be as large as 128 bytes. + if (len < 128ul) { + char buf[256] = {'\0'}; + memcpy (buf, str, len); + memset (buf + len, '@', 128ul - len); + buf[128u] = '\0'; + str_len = 128ul; + str_long = buf; } - MurmurHash3_x64_128 (str, buf_len, seed, hash); + + uint32_t hash[4] = {0u}; // Output for the hash + MurmurHash3_x64_128 (str_long, str_len, seed, hash); return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1; }