From 21ac49dd6da657628c4cba8bc436f7760f58db41 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Mon, 18 Nov 2024 00:47:02 +0100 Subject: [PATCH] wip: case-insensitive lookup --- CMakeLists.txt | 9 +- cmake/dwarfs-config.cmake.in | 16 ++- cmake/libdwarfs.cmake | 2 +- include/dwarfs/reader/metadata_options.h | 1 + src/reader/internal/metadata_v2.cpp | 97 ++++++++++++-- test/dwarfs_test.cpp | 163 +++++++++++++++++++++++ vcpkg.json | 1 + 7 files changed, 274 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51b6b4f39..9f5aaef40 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,7 @@ set(PARALLEL_HASHMAP_REQUIRED_VERSION 1.3.8) set(PARALLEL_HASHMAP_PREFERRED_VERSION 1.3.12) set(BOOST_REQUIRED_VERSION 1.67.0) +set(LIBICUUC_REQUIRED_VERSION 70.0) set(LIBCRYPTO_REQUIRED_VERSION 3.0.0) set(LIBLZ4_REQUIRED_VERSION 1.9.3) set(LIBLZMA_REQUIRED_VERSION 5.2.5) @@ -190,8 +191,13 @@ if(WITH_LIBDWARFS) OPTIONAL_COMPONENTS process) if(APPLE) + find_program(HOMEBREW_EXE brew) + execute_process( + COMMAND ${HOMEBREW_EXE} --prefix icu4c + OUTPUT_VARIABLE LIBICU4C_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + list(PREPEND CMAKE_PREFIX_PATH ${LIBICU4C_PREFIX}) if(USE_HOMEBREW_LIBARCHIVE) - find_program(HOMEBREW_EXE brew) execute_process( COMMAND ${HOMEBREW_EXE} --prefix libarchive OUTPUT_VARIABLE LIBARCHIVE_PREFIX @@ -204,6 +210,7 @@ if(WITH_LIBDWARFS) find_package(cpptrace REQUIRED CONFIG) endif() + pkg_check_modules(LIBICUUC REQUIRED IMPORTED_TARGET icu-uc>=${LIBICUUC_REQUIRED_VERSION}) pkg_check_modules(LIBCRYPTO REQUIRED IMPORTED_TARGET libcrypto>=${LIBCRYPTO_REQUIRED_VERSION}) pkg_check_modules(LIBARCHIVE REQUIRED IMPORTED_TARGET libarchive>=${LIBARCHIVE_REQUIRED_VERSION}) pkg_check_modules(XXHASH REQUIRED IMPORTED_TARGET libxxhash>=${XXHASH_REQUIRED_VERSION}) diff --git a/cmake/dwarfs-config.cmake.in b/cmake/dwarfs-config.cmake.in index e4ebb2eaa..b2b2252e7 100644 --- a/cmake/dwarfs-config.cmake.in +++ 
b/cmake/dwarfs-config.cmake.in @@ -9,13 +9,20 @@ set(DWARFS_PREFIX_DIR "${PACKAGE_PREFIX_DIR}") find_package(PkgConfig) -if(@APPLE@ AND @USE_HOMEBREW_LIBARCHIVE@) # APPLE AND USE_HOMEBREW_LIBARCHIVE +if(@APPLE@) # APPLE find_program(HOMEBREW_EXE brew) execute_process( - COMMAND ${HOMEBREW_EXE} --prefix libarchive - OUTPUT_VARIABLE LIBARCHIVE_PREFIX + COMMAND ${HOMEBREW_EXE} --prefix icu4c + OUTPUT_VARIABLE LIBICU4C_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE) - list(PREPEND CMAKE_PREFIX_PATH ${LIBARCHIVE_PREFIX}) + list(PREPEND CMAKE_PREFIX_PATH ${LIBICU4C_PREFIX}) + if(@USE_HOMEBREW_LIBARCHIVE@) # USE_HOMEBREW_LIBARCHIVE + execute_process( + COMMAND ${HOMEBREW_EXE} --prefix libarchive + OUTPUT_VARIABLE LIBARCHIVE_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + list(PREPEND CMAKE_PREFIX_PATH ${LIBARCHIVE_PREFIX}) + endif() endif() find_dependency(Threads REQUIRED) @@ -24,6 +31,7 @@ find_dependency(gflags CONFIG REQUIRED) find_dependency(Boost @BOOST_REQUIRED_VERSION@ REQUIRED CONFIG COMPONENTS chrono context filesystem iostreams program_options regex system thread OPTIONAL_COMPONENTS process) +pkg_check_modules(LIBICUUC REQUIRED IMPORTED_TARGET icu-uc>=@LIBICUUC_REQUIRED_VERSION@) pkg_check_modules(LIBCRYPTO REQUIRED IMPORTED_TARGET libcrypto>=@LIBCRYPTO_REQUIRED_VERSION@) pkg_check_modules(LIBARCHIVE REQUIRED IMPORTED_TARGET libarchive>=@LIBARCHIVE_REQUIRED_VERSION@) pkg_check_modules(XXHASH REQUIRED IMPORTED_TARGET libxxhash>=@XXHASH_REQUIRED_VERSION@) diff --git a/cmake/libdwarfs.cmake b/cmake/libdwarfs.cmake index 9907dff04..8c89179f3 100644 --- a/cmake/libdwarfs.cmake +++ b/cmake/libdwarfs.cmake @@ -162,7 +162,7 @@ add_cpp2_thrift_library(thrift/features.thrift TARGET dwarfs_features_thrift OUTPUT_PATH dwarfs) target_link_libraries(dwarfs_common PRIVATE dwarfs_folly_lite PkgConfig::LIBCRYPTO PkgConfig::XXHASH PkgConfig::ZSTD) -target_link_libraries(dwarfs_reader PUBLIC dwarfs_common) +target_link_libraries(dwarfs_reader PUBLIC dwarfs_common PkgConfig::LIBICUUC) 
target_link_libraries(dwarfs_writer PUBLIC dwarfs_common PkgConfig::ZSTD) target_link_libraries(dwarfs_extractor PUBLIC dwarfs_reader) target_link_libraries(dwarfs_rewrite PUBLIC dwarfs_reader dwarfs_writer) diff --git a/include/dwarfs/reader/metadata_options.h b/include/dwarfs/reader/metadata_options.h index 5bb42fd9a..e8e629b45 100644 --- a/include/dwarfs/reader/metadata_options.h +++ b/include/dwarfs/reader/metadata_options.h @@ -32,6 +32,7 @@ struct metadata_options { bool enable_nlink{false}; bool readonly{false}; bool check_consistency{false}; + bool case_insensitive_lookup{false}; size_t block_size{512}; std::optional fs_uid{}; std::optional fs_gid{}; diff --git a/src/reader/internal/metadata_v2.cpp b/src/reader/internal/metadata_v2.cpp index dec8ddd98..18d0f83d1 100644 --- a/src/reader/internal/metadata_v2.cpp +++ b/src/reader/internal/metadata_v2.cpp @@ -46,6 +46,8 @@ #include +#include + #include #include #include @@ -416,7 +418,7 @@ class metadata_ final : public metadata_v2::impl { , symlinks_(meta_.compact_symlinks() ? 
string_table(lgr, "symlinks", *meta_.compact_symlinks())
                       : string_table(meta_.symlinks()))
-      // clang-format off
+      , dir_icase_cache_{build_dir_icase_cache()} // clang-format off
       PERFMON_CLS_PROXY_INIT(perfmon, "metadata_v2")
       PERFMON_CLS_TIMER_INIT(find)
       PERFMON_CLS_TIMER_INIT(getattr)
@@ -921,6 +923,78 @@ class metadata_ final : public metadata_v2::impl {
     return packed_nlinks;
   }
 
+  // Maps a UTF-8 string to the key used for case-insensitive name matching.
+  //
+  // The key is produced by Unicode case folding rather than lowercasing:
+  // icu::UnicodeString::toLower() without arguments follows the conventions
+  // of the default locale, so lookups could behave differently from one
+  // environment to the next (e.g. "I" lowercases to a dotless i under a
+  // Turkish locale). foldCase() is locale-independent. The function name is
+  // kept so that existing call sites continue to work.
+  static std::string utf8_to_lower(std::string str) {
+    auto ustr = icu::UnicodeString::fromUTF8(str);
+    ustr.foldCase();
+    str.clear(); // toUTF8String() appends, so start from an empty string
+    ustr.toUTF8String(str);
+    return str;
+  }
+
+  // Builds the per-directory tables used by case-insensitive lookups.
+  //
+  // Returns an empty vector unless options_.case_insensitive_lookup is set.
+  // Otherwise one element is appended per directory inode: if the
+  // directory's entries are not already ordered by folded name, the element
+  // stores the permutation that visits them in folded-name order, so that
+  // find() can binary-search it; if they are, the element stays
+  // default-constructed and no indirection is needed.
+  std::vector> build_dir_icase_cache() const {
+    std::vector> cache;
+
+    if (options_.case_insensitive_lookup) {
+      auto td = LOG_TIMED_DEBUG;
+      size_t num_cached_dirs = 0;
+      size_t total_cache_size = 0;
+
+      cache.reserve(meta_.directories().size());
+
+      // NOTE(review): the last element of directories() is skipped,
+      // presumably because it is a sentinel -- confirm.
+      for (uint32_t inode = 0; inode < meta_.directories().size() - 1;
+           ++inode) {
+        auto& pv = cache.emplace_back();
+        directory_view dir{inode, global_};
+        auto range = dir.entry_range();
+        std::vector names(range.size());
+        std::transform(range.begin(), range.end(), names.begin(), [&](auto ix) {
+          return utf8_to_lower(dir_entry_view_impl::name(ix, global_));
+        });
+        std::vector entries(range.size());
+        std::iota(entries.begin(), entries.end(), 0);
+        // Order the entry positions by folded name. The sort is stable so
+        // that entries whose names fold to the same key keep their stored
+        // order instead of an unspecified one.
+        std::stable_sort(entries.begin(), entries.end(),
+                         [&](auto a, auto b) { return names[a] < names[b]; });
+        // `entries` is a permutation of 0..n-1, so it is sorted exactly when
+        // it is the identity, i.e. when no remapping is needed.
+        if (!std::is_sorted(entries.begin(), entries.end())) {
+          // std::bit_width(n) bits can represent every index in [0, n).
+          pv.reset(std::bit_width(entries.size()), entries.size());
+          for (size_t i = 0; i < entries.size(); ++i) {
+            pv.set(i, entries[i]);
+          }
+          ++num_cached_dirs;
+          total_cache_size += pv.size_in_bytes();
+        }
+      }
+
+      td << "built case-insensitive directory cache for " << num_cached_dirs
+         << " directories (" << size_with_unit(total_cache_size) << ")";
+    }
+
+    return cache;
+  }
+
   size_t total_file_entries() const {
     return (dev_inode_offset_ - file_inode_offset_) +
            (meta_.dir_entries()
@@ -944,6 +1018,7 @@ class metadata_ final : public metadata_v2::impl {
   const int
unique_files_;
   const metadata_options options_;
   const string_table symlinks_;
+  std::vector> const dir_icase_cache_; // built by build_dir_icase_cache(); find() indexes it by directory inode
   PERFMON_CLS_PROXY_DECL
   PERFMON_CLS_TIMER_DECL(find)
   PERFMON_CLS_TIMER_DECL(getattr)
@@ -1695,15 +1746,43 @@ metadata_::find(directory_view dir, std::string_view name) const {
 
   auto range = dir.entry_range();
 
-  auto it = std::lower_bound(
-      range.begin(), range.end(), name, [&](auto ix, std::string_view name) {
-        return internal::dir_entry_view_impl::name(ix, global_) < name;
-      });
+  if (options_.case_insensitive_lookup) { // compare utf8_to_lower() keys instead of the stored names
+    auto const& cache = dir_icase_cache_[dir.inode()]; // this directory's permutation; empty if none was stored
+    auto ixr = boost::irange(0, range.size()); // positions 0 .. range.size() - 1
+    auto key = utf8_to_lower(std::string(name)); // convert the query once, before searching
+
+    auto it = std::lower_bound(
+        ixr.begin(), ixr.end(), key, [&](auto ix, std::string const& key) {
+          if (!cache.empty()) { // map the search position through the permutation
+            ix = cache[ix];
+          }
+          return utf8_to_lower(internal::dir_entry_view_impl::name(
+                     range[ix], global_)) < key;
+        });
+
+    if (it != ixr.end()) { // lower_bound yields the first position whose key is not less than `key`
+      auto ix = *it;
+      if (!cache.empty()) {
+        ix = cache[ix];
+      }
+      ix = range[ix]; // position within the directory -> dir entry index
+      if (utf8_to_lower(internal::dir_entry_view_impl::name(ix, global_)) ==
+          key) { // an equality check is still needed to confirm a match
+        return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
+            ix, global_.self_dir_entry(dir.inode()), global_)};
+      }
+    }
+  } else { // case-sensitive path: binary search on the stored names
+    auto it = std::lower_bound(
+        range.begin(), range.end(), name, [&](auto ix, std::string_view name) {
+          return internal::dir_entry_view_impl::name(ix, global_) < name;
+        });
 
-  if (it != range.end()) {
-    if (internal::dir_entry_view_impl::name(*it, global_) == name) {
-      return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
-          *it, global_.self_dir_entry(dir.inode()), global_)};
+    if (it != range.end()) {
+      if (internal::dir_entry_view_impl::name(*it, global_) == name) {
+        return dir_entry_view{dir_entry_view_impl::from_dir_entry_index_shared(
+            *it, global_.self_dir_entry(dir.inode()), global_)};
+      }
     }
   }
 
diff --git a/test/dwarfs_test.cpp b/test/dwarfs_test.cpp
index c1d8de6c1..08104b21d 100644
---
a/test/dwarfs_test.cpp
+++ b/test/dwarfs_test.cpp
@@ -2054,3 +2054,179 @@ TEST(filesystem, multi_image) {
     EXPECT_EQ("baz", fs.read_string(fs.open(baz->inode())));
   }
 }
+
+// Checks name lookup with `case_insensitive_lookup` both disabled and enabled
+// against the same image: exact names must always resolve, names that differ
+// only in case must resolve only when the option is enabled, and names that
+// differ in more than case must never resolve.
+TEST(filesystem, case_insensitive_lookup) {
+  auto input = std::make_shared();
+
+  input->add_dir("");
+  input->add_dir(u8"hEllÖwÖrLD");
+  input->add_dir(u8"FÜñKÿStrÍñg");
+  input->add_dir(u8"unícødérøcks");
+  input->add_dir(u8"JÄLAPEÑOPEPPÉR");
+  input->add_dir(u8"SpIcYsÜsHiRoLL");
+  input->add_dir(u8"CAFÉMØCHAlatte");
+  input->add_dir(u8"ČhàŧGƤŦ");
+  input->add_dir(u8"lõREMÏpSüM");
+  input->add_dir(u8"ŠåmpŁËŠTrInG");
+  input->add_dir(u8"pythonprogramming");
+  input->add_dir(u8"DÃTâScïÊNcË");
+  input->add_dir(u8"AIISFÛTÛRË");
+  input->add_dir(u8"readability");
+  input->add_file(u8"TëStCãSeSçÉNâRïÖ", "testcasescenario");
+  input->add_file(u8"lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding");
+  input->add_file(u8"lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface");
+  input->add_file(u8"lõREMÏpSüM/NØRTHèast", "northeast");
+  input->add_file(u8"lõREMÏpSüM/SPACEadventure", "spaceadventure");
+  input->add_file(u8"lõREMÏpSüM/cõMPLEXïTy🚀", "complexity");
+  input->add_file(u8"lõREMÏpSüM/thisisatest", "thisisatest");
+
+  // Paths spelled exactly as they were added to the input.
+  std::vector case_sensitive_dirs{
+      u8"/hEllÖwÖrLD", u8"/FÜñKÿStrÍñg", u8"/unícødérøcks",
+      u8"/JÄLAPEÑOPEPPÉR", u8"/SpIcYsÜsHiRoLL", u8"/CAFÉMØCHAlatte",
+      u8"/ČhàŧGƤŦ", u8"/lõREMÏpSüM", u8"/ŠåmpŁËŠTrInG",
+      u8"/pythonprogramming", u8"/DÃTâScïÊNcË", u8"/AIISFÛTÛRË",
+      u8"/readability",
+  };
+
+  std::vector> case_sensitive_files{
+      {u8"/TëStCãSeSçÉNâRïÖ", "testcasescenario"},
+      {u8"/lõREMÏpSüM/ÆSTHETÎCcøding", "aestheticcoding"},
+      {u8"/lõREMÏpSüM/smîLëyFÀÇë😊", "smileyface"},
+      {u8"/lõREMÏpSüM/NØRTHèast", "northeast"},
+      {u8"/lõREMÏpSüM/SPACEadventure", "spaceadventure"},
+      {u8"/lõREMÏpSüM/cõMPLEXïTy🚀", "complexity"},
+      {u8"/lõREMÏpSüM/thisisatest", "thisisatest"},
+  };
+
+  // The same paths with the case of some characters changed.
+  std::vector case_insensitive_dirs{
+      u8"/HELlÖwÖRLD", u8"/FÜÑKÿSTríÑg", u8"/uNÍcødéRøcks",
+      u8"/JÄLApeñOPePPÉR", u8"/SpiCysÜshiRoLL", u8"/CAféMØchAlatte",
+      u8"/čhàŧgƥŧ", u8"/lõremÏpsüM", u8"/šåmpŁëšTrInG",
+      u8"/pyTHonproGRamming", u8"/DãtÂScïêNcË", u8"/AiisFÛTÛRË",
+      u8"/reADabiLIty",
+  };
+
+  std::vector> case_insensitive_files{
+      {u8"/TësTcãSeSçéNâRïÖ", "testcasescenario"},
+      {u8"/lõRemïpSüM/ÆstHETÎCcØDing", "aestheticcoding"},
+      {u8"/lõremïPSüM/smîlËYfàÇë😊", "smileyface"},
+      {u8"/lõREMÏPsÜM/NØRthÈAst", "northeast"},
+      {u8"/lõRemïPsüM/SPACEadvENTure", "spaceadventure"},
+      {u8"/LÕREMÏpSüM/CõMPlexïTy🚀", "complexity"},
+      {u8"/lõrEMÏpSüM/thiSISatest", "thisisatest"},
+  };
+
+  // Paths that differ from every entry by more than case: a character has
+  // been added, dropped or replaced somewhere along the path.
+  std::vector non_matching_entries{
+      u8"/HELlÖwÖRLDx",
+      u8"/FÜÑKÿSTríÑj",
+      u8"/uNÍcødéRcks",
+      u8"/JÄLApeñOPePPÉ",
+      u8"/SpiCysÜshiRoLLx",
+      u8"/CAféMØchAltte",
+      u8"/čhàŧgƥŧx",
+      u8"/lõremÏpsü",
+      u8"/šåmpŁëšTrnG",
+      u8"/pyTHonproGRammin",
+      u8"/DãtÂScïêNcËx",
+      u8"/AiisFÛTÛTË",
+      u8"/reADabiLItx",
+      u8"/TësRcãSeSçéNâRïÖ",
+      u8"/lõRemïpüM/ÆstHETÎCcØDing",
+      u8"/lõremïPSüM/mîlËYfàÇë😊",
+      u8"/lõRMÏPsÜM/NØRthÈAst",
+      u8"/lõRemïPsüM/SPACEadvENTurex",
+      u8"/LÕREMÏpSüM/CõMPexïTy🚀",
+      u8"/lõrEMÏpSüM/thiSISatesy",
+  };
+
+  test::test_logger lgr;
+  auto fsimage = build_dwarfs(lgr, input, "null");
+
+  auto mm = std::make_shared(std::move(fsimage));
+
+  // Case-sensitive lookup: only the exact spellings are found.
+  {
+    reader::filesystem_v2 fs(lgr, *input, mm,
+                             {.metadata = {.case_insensitive_lookup = false}});
+
+    for (auto const& dir : case_sensitive_dirs) {
+      auto name = u8string_to_string(dir);
+      auto dev = fs.find(name);
+      EXPECT_TRUE(dev) << name;
+    }
+
+    for (auto const& [file, content] : case_sensitive_files) {
+      auto name = u8string_to_string(file);
+      auto dev = fs.find(name);
+      // ASSERT, not EXPECT: `dev` is dereferenced on the next line.
+      ASSERT_TRUE(dev) << name;
+      EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name;
+    }
+
+    for (auto const& dir : case_insensitive_dirs) {
+      auto name = u8string_to_string(dir);
+      auto dev = fs.find(name);
+      EXPECT_FALSE(dev) << name;
+    }
+
+    for (auto const& [file, content] : case_insensitive_files) {
+      auto name = u8string_to_string(file);
+      auto dev = fs.find(name);
+      EXPECT_FALSE(dev) << name;
+    }
+
+    for (auto const& ent : non_matching_entries) {
+      auto name = u8string_to_string(ent);
+      auto dev = fs.find(name);
+      EXPECT_FALSE(dev) << name;
+    }
+  }
+
+  // Case-insensitive lookup: exact and case-varied spellings are both found.
+  {
+    reader::filesystem_v2 fs(lgr, *input, mm,
+                             {.metadata = {.case_insensitive_lookup = true}});
+
+    for (auto const& dir : case_sensitive_dirs) {
+      auto name = u8string_to_string(dir);
+      auto dev = fs.find(name);
+      EXPECT_TRUE(dev) << name;
+    }
+
+    for (auto const& [file, content] : case_sensitive_files) {
+      auto name = u8string_to_string(file);
+      auto dev = fs.find(name);
+      // ASSERT, not EXPECT: `dev` is dereferenced on the next line.
+      ASSERT_TRUE(dev) << name;
+      EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name;
+    }
+
+    for (auto const& dir : case_insensitive_dirs) {
+      auto name = u8string_to_string(dir);
+      auto dev = fs.find(name);
+      EXPECT_TRUE(dev) << name;
+    }
+
+    for (auto const& [file, content] : case_insensitive_files) {
+      auto name = u8string_to_string(file);
+      auto dev = fs.find(name);
+      // ASSERT, not EXPECT: `dev` is dereferenced on the next line.
+      ASSERT_TRUE(dev) << name;
+      EXPECT_EQ(content, fs.read_string(fs.open(dev->inode()))) << name;
+    }
+
+    for (auto const& ent : non_matching_entries) {
+      auto name = u8string_to_string(ent);
+      auto dev = fs.find(name);
+      EXPECT_FALSE(dev) << name;
+    }
+  }
+}
diff --git a/vcpkg.json b/vcpkg.json
index fbb2f7619..17079a6c3 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -20,6 +20,7 @@
     "double-conversion",
     "fmt",
     "glog",
+    "icu",
     "libarchive",
     "libevent",
     "libflac",