Skip to content

Commit

Permalink
Merge pull request #109 from JaeseungYeom/all_fix
Browse files Browse the repository at this point in the history
improve hash distribution with short strings
  • Loading branch information
JaeseungYeom authored Feb 15, 2024
2 parents d041954 + 8f49ec3 commit 24406a0
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 15 deletions.
17 changes: 14 additions & 3 deletions src/dyad/core/dyad_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,28 @@ static int gen_path_key (const char* restrict str,
uint32_t hash[4] = {0u}; // Output for the hash
size_t cx = 0ul;
int n = 0;
size_t str_len = strlen (str);
const char* str_long = str;

if (str == NULL || path_key == NULL || len == 0ul) {
if (str == NULL || path_key == NULL || len == 0ul || str_len == 0ul) {
DYAD_C_FUNCTION_END();
return -1;
}
path_key[0] = '\0';

// Just append the string so that it can be as large as 128 bytes.
if (str_len < 128ul) {
char buf[256] = {'\0'};
memcpy (buf, str, str_len);
memset (buf + str_len, '@', 128ul - str_len);
buf[128u] = '\0';
str_len = 128ul;
str_long = buf;
}

for (uint32_t d = 0u; d < depth; d++) {
seed += seeds[d % 10];
// TODO add assert that str is not NULL
MurmurHash3_x64_128 (str, strlen (str), seed, hash);
MurmurHash3_x64_128 (str_long, str_len, seed, hash);
uint32_t bin = (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) % width;
n = snprintf (path_key + cx, len - cx, "%x.", bin);
cx += n;
Expand Down
8 changes: 8 additions & 0 deletions src/dyad/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ add_executable(test_cmp_canonical_path_prefix test_cmp_canonical_path_prefix.c
${CMAKE_CURRENT_SOURCE_DIR}/../common/dyad_structures.h)
target_compile_definitions(test_cmp_canonical_path_prefix PUBLIC DYAD_HAS_CONFIG)
target_link_libraries(test_cmp_canonical_path_prefix PUBLIC ${PROJECT_NAME}_utils)

# Standalone test executable built from test_murmur3.c and linked against
# the project's murmur3 library target.
add_executable(test_murmur3 test_murmur3.c)
target_compile_definitions(test_murmur3 PUBLIC DYAD_HAS_CONFIG)
target_link_libraries(test_murmur3 PUBLIC ${PROJECT_NAME}_murmur3)

# Link the libraries listed in CPP_LOGGER_LIBRARIES into the prefix-comparison
# test only when DYAD_LOGGER is set to "CPP_LOGGER".
if(DYAD_LOGGER STREQUAL "CPP_LOGGER")
target_link_libraries(test_cmp_canonical_path_prefix PRIVATE ${CPP_LOGGER_LIBRARIES})
endif()
Expand All @@ -49,6 +54,9 @@ endif()

# When the DYAD_C_FLAGS_werror target exists, link it privately into the utils
# and murmur3 libraries and into both test executables.
# NOTE(review): presumably an interface target carrying warnings-as-errors
# compile flags -- confirm against its definition.
if (TARGET DYAD_C_FLAGS_werror)
target_link_libraries(${PROJECT_NAME}_utils PRIVATE DYAD_C_FLAGS_werror)
target_link_libraries(${PROJECT_NAME}_murmur3 PRIVATE DYAD_C_FLAGS_werror)
target_link_libraries(test_murmur3 PRIVATE DYAD_C_FLAGS_werror)
target_link_libraries(test_cmp_canonical_path_prefix PRIVATE DYAD_C_FLAGS_werror)
endif ()

install(
Expand Down
76 changes: 76 additions & 0 deletions src/dyad/utils/test_murmur3.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#include "dyad/utils/murmur3.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>

/**
 * Build a hashed, dot-separated key for a string.
 *
 * For each of `depth` levels the input (right-padded with '@' to 128 bytes
 * when shorter) is hashed with MurmurHash3_x64_128 under a running seed. The
 * four 32-bit output words are XOR-folded, reduced modulo `width`, and the
 * resulting bin is appended to `path_key` as lowercase hex followed by '.'.
 * The original, unpadded `str` is appended last.
 *
 * @param str       NUL-terminated input; must be non-NULL and non-empty.
 * @param path_key  Output buffer that receives the NUL-terminated key.
 * @param len       Capacity of `path_key` in bytes; must be non-zero.
 * @param depth     Number of hashed levels to emit (0 emits only `str`).
 * @param width     Number of bins per level; must be non-zero when depth > 0.
 * @return 0 on success; -1 on invalid arguments, on a formatting error, or
 *         when the key does not fit in `len` bytes.
 */
static int gen_path_key (const char* restrict str,
                         char* restrict path_key,
                         const size_t len,
                         const uint32_t depth,
                         const uint32_t width)
{
    enum { HASH_MIN_LEN = 128 };
    static const uint32_t seeds[10] =
        {104677u, 104681u, 104683u, 104693u, 104701u, 104707u, 104711u, 104717u, 104723u, 104729u};

    const size_t min_len = (size_t) HASH_MIN_LEN;
    uint32_t seed = 57u;
    uint32_t hash[4] = {0u}; // Output for the hash
    size_t cx = 0ul;
    int n = 0;

    // Validate the pointers before touching them: strlen (NULL) is undefined.
    if (str == NULL || path_key == NULL || len == 0ul) {
        return -1;
    }
    // A zero width would make the `% width` below a division by zero.
    if (depth > 0u && width == 0u) {
        return -1;
    }

    size_t str_len = strlen (str);
    if (str_len == 0ul) {
        return -1;
    }
    const char* str_long = str;
    path_key[0] = '\0';

#if 1
    // Pad short strings so that the hashed input is always at least 128 bytes.
    // The buffer is declared at function scope on purpose: str_long keeps
    // pointing at it for the whole hashing loop, so it must outlive this `if`.
    char buf[HASH_MIN_LEN + 1] = {'\0'};
    if (str_len < min_len) {
        memcpy (buf, str, str_len);
        memset (buf + str_len, '@', min_len - str_len);
        buf[min_len] = '\0';
        str_len = min_len;
        str_long = buf;
    }
#endif

    for (uint32_t d = 0u; d < depth; d++) {
        seed += seeds[d % 10];
        MurmurHash3_x64_128 (str_long, str_len, seed, hash);
        uint32_t bin = (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) % width;
        n = snprintf (path_key + cx, len - cx, "%x.", bin);
        // Check for an encoding error before folding n into the unsigned offset.
        if (n < 0) {
            return -1;
        }
        cx += (size_t) n;
        if (cx >= len) {
            return -1; // truncated: no room left for the remaining components
        }
    }

    n = snprintf (path_key + cx, len - cx, "%s", str);
    if (n < 0 || cx + (size_t) n >= len) {
        return -1;
    }

    return 0;
}


/**
 * Parse a decimal, unsigned integer that fits in a uint32_t.
 * The text must start with a digit and contain nothing but digits.
 *
 * @return 0 on success with the value stored in `*out`; -1 otherwise.
 */
static int parse_u32 (const char* text, uint32_t* out)
{
    char* end = NULL;

    // Requiring a leading digit rejects empty strings, signs, and leading
    // whitespace, all of which strtoull would otherwise accept or wrap.
    if (text == NULL || out == NULL || text[0] < '0' || text[0] > '9') {
        return -1;
    }
    // unsigned long long is at least 64 bits wide, so an overflowing input
    // saturates well above UINT32_MAX and is rejected by the range check.
    const unsigned long long value = strtoull (text, &end, 10);
    if (end == text || *end != '\0' || value > UINT32_MAX) {
        return -1;
    }
    *out = (uint32_t) value;
    return 0;
}

/**
 * Command-line driver: prints "<str>\t<path key>" for each string argument.
 *
 * Usage: prog depth width str1 [str2 [str3 ...]]
 *
 * @return EXIT_SUCCESS when every key was generated; EXIT_FAILURE on bad
 *         arguments or when any key could not be generated.
 */
int main (int argc, char** argv)
{
    if (argc < 4) {
        printf ("Usage: %s depth width str1 [str2 [str3 ...]]\n", argv[0]);
        return EXIT_FAILURE;
    }

    uint32_t depth = 0u;
    uint32_t width = 0u;
    if (parse_u32 (argv[1], &depth) != 0 || parse_u32 (argv[2], &width) != 0) {
        fprintf (stderr, "%s: depth and width must be non-negative integers\n", argv[0]);
        return EXIT_FAILURE;
    }
    // The bin index is computed modulo width, so width must not be zero
    // whenever at least one level is hashed.
    if (depth > 0u && width == 0u) {
        fprintf (stderr, "%s: width must be positive when depth is positive\n", argv[0]);
        return EXIT_FAILURE;
    }

    int status = EXIT_SUCCESS;
    for (int i = 3; i < argc; i++) {
        char path_key[PATH_MAX + 1] = {'\0'};
        if (gen_path_key (argv[i], path_key, PATH_MAX, depth, width) != 0) {
            // Do not print a partially written key as if it were valid.
            fprintf (stderr, "%s: cannot generate a path key for '%s'\n", argv[0], argv[i]);
            status = EXIT_FAILURE;
            continue;
        }
        printf ("%s\t%s\n", argv[i], path_key);
    }

    return status;
}
43 changes: 31 additions & 12 deletions src/dyad/utils/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,32 +64,51 @@
/**
 * Hash a NUL-terminated string with MurmurHash3_x64_128.
 *
 * Strings shorter than 128 bytes are right-padded with '@' up to 128 bytes
 * before hashing; longer strings are hashed as they are.
 *
 * @param str   NUL-terminated string to hash.
 * @param seed  Seed passed to MurmurHash3_x64_128.
 * @return 0 when `str` is NULL or empty; otherwise the XOR of the four 32-bit
 *         hash words plus one.
 *         NOTE(review): the "+ 1" wraps to 0 when the XOR equals UINT32_MAX,
 *         so 0 is not strictly reserved for the failure case.
 */
uint32_t hash_str (const char* str, const uint32_t seed)
{
    enum { HASH_MIN_LEN = 128 };
    const size_t min_len = (size_t) HASH_MIN_LEN;

    // Declared at function scope: str_long may point into it when the hash
    // function runs, so it must not live in the padding branch's block.
    char buf[HASH_MIN_LEN + 1] = {'\0'};
    uint32_t hash[4] = {0u}; // Output for the hash

    if (!str) {
        return 0u;
    }
    const char* str_long = str;
    size_t str_len = strlen (str);
    if (str_len == 0ul) {
        return 0u;
    }

    // Pad short strings so that the hashed input is at least 128 bytes long.
    if (str_len < min_len) {
        memcpy (buf, str, str_len);
        memset (buf + str_len, '@', min_len - str_len);
        buf[min_len] = '\0';
        str_len = min_len;
        str_long = buf;
    }

    MurmurHash3_x64_128 (str_long, str_len, seed, hash);
    return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1;
}

/**
 * Hash only the first `len` bytes of a string with MurmurHash3_x64_128.
 *
 * Prefixes shorter than 128 bytes are copied and right-padded with '@' up to
 * 128 bytes before hashing; longer prefixes are hashed in place.
 *
 * @param str   NUL-terminated string whose prefix is hashed.
 * @param seed  Seed passed to MurmurHash3_x64_128.
 * @param len   Length of the prefix in bytes.
 * @return 0 when hashing is not possible (`str` is NULL, `len` is 0, or `str`
 *         is shorter than `len`); otherwise the XOR of the four 32-bit hash
 *         words plus one.
 *         NOTE(review): the "+ 1" wraps to 0 when the XOR equals UINT32_MAX,
 *         so a successful hash is not strictly guaranteed to be non-zero.
 */
uint32_t hash_path_prefix (const char* str, const uint32_t seed,
                           const size_t len)
{
    enum { HASH_MIN_LEN = 128 };
    const size_t min_len = (size_t) HASH_MIN_LEN;

    // Declared at function scope: str_long may point into it when the hash
    // function runs, so it must not live in the padding branch's block.
    char buf[HASH_MIN_LEN + 1] = {'\0'};
    uint32_t hash[4] = {0u}; // Output for the hash

    if (!str || len == 0ul) {
        return 0u;
    }
    // The requested prefix must lie entirely inside the string.
    if (strlen (str) < len) {
        return 0u;
    }

    const char* str_long = str;
    size_t str_len = len;

    // Pad short prefixes so that the hashed input is at least 128 bytes long.
    if (len < min_len) {
        memcpy (buf, str, len);
        memset (buf + len, '@', min_len - len);
        buf[min_len] = '\0';
        str_len = min_len;
        str_long = buf;
    }

    MurmurHash3_x64_128 (str_long, str_len, seed, hash);
    return (hash[0] ^ hash[1] ^ hash[2] ^ hash[3]) + 1;
}

Expand Down

0 comments on commit 24406a0

Please sign in to comment.