From 7a9bd017754993708c8cb806f8eda6468685bde4 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Sun, 17 Dec 2023 20:36:07 +0100 Subject: [PATCH 001/571] chore: update folly/fbthrift --- fbthrift | 2 +- folly | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fbthrift b/fbthrift index 0d87d2068..1678fba07 160000 --- a/fbthrift +++ b/fbthrift @@ -1 +1 @@ -Subproject commit 0d87d2068b9df8ae5b7ab21048ffeb58ba91a9ca +Subproject commit 1678fba0715b526ff7f15571695f179be89eec71 diff --git a/folly b/folly index 3a0cd44fc..357c1c775 160000 --- a/folly +++ b/folly @@ -1 +1 @@ -Subproject commit 3a0cd44fc76e1453cc2565cbd4d2a41caf355eb4 +Subproject commit 357c1c77503fa6b9af7ce0afd630004809c6953f From 4e0d2ba25e91ba248839006b948bd49151a63aac Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Fri, 28 Jul 2023 14:33:42 +0200 Subject: [PATCH 002/571] Support features sets --- CMakeLists.txt | 1 + include/dwarfs/features.h | 31 ++++++++++++++++++++ src/dwarfs/features.cpp | 59 ++++++++++++++++++++++++++++++++++++++ src/dwarfs/metadata_v2.cpp | 18 +++++++++++- thrift/metadata.thrift | 13 +++++++++ 5 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 include/dwarfs/features.h create mode 100644 src/dwarfs/features.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b400a262..dfab44268 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -361,6 +361,7 @@ list( src/dwarfs/console_writer.cpp src/dwarfs/entry.cpp src/dwarfs/error.cpp + src/dwarfs/features.cpp src/dwarfs/file_scanner.cpp src/dwarfs/file_stat.cpp src/dwarfs/file_type.cpp diff --git a/include/dwarfs/features.h b/include/dwarfs/features.h new file mode 100644 index 000000000..83a787ee8 --- /dev/null +++ b/include/dwarfs/features.h @@ -0,0 +1,31 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include + +namespace dwarfs { + +std::set get_unsupported_features(std::set features); + +} // namespace dwarfs diff --git a/src/dwarfs/features.cpp b/src/dwarfs/features.cpp new file mode 100644 index 000000000..6a3091719 --- /dev/null +++ b/src/dwarfs/features.cpp @@ -0,0 +1,59 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include + +#include "dwarfs/features.h" + +namespace dwarfs { + +namespace { + +std::set supported_features{ +#ifdef DWARFS_HAVE_LIBZSTD + "zstd", +#endif +#ifdef DWARFS_HAVE_LIBLZ4 + "lz4", +#endif +#ifdef DWARFS_HAVE_LIBLZMA + "lzma", +#endif +#ifdef DWARFS_HAVE_LIBBROTLI + "brotli", +#endif +#ifdef DWARFS_HAVE_FLAC + "flac", +#endif +}; + +} // namespace + +std::set get_unsupported_features(std::set features) { + std::set rv; + std::set_difference(features.begin(), features.end(), + supported_features.begin(), supported_features.end(), + std::inserter(rv, rv.end())); + return rv; +} + +} // namespace dwarfs diff --git a/src/dwarfs/metadata_v2.cpp b/src/dwarfs/metadata_v2.cpp index 052eb594c..4faac1be1 100644 --- a/src/dwarfs/metadata_v2.cpp +++ b/src/dwarfs/metadata_v2.cpp @@ -44,6 +44,7 @@ #include #include "dwarfs/error.h" +#include "dwarfs/features.h" #include "dwarfs/file_stat.h" #include "dwarfs/fstypes.h" #include "dwarfs/logger.h" @@ -136,6 +137,20 @@ map_frozen(std::span schema, std::span data) { return ret; } +MappedFrozen +check_frozen(MappedFrozen meta) { + if (meta.features()) { + auto unsupported = get_unsupported_features(meta.features()->thaw()); + if (!unsupported.empty()) { + DWARFS_THROW(runtime_error, + fmt::format("file system uses the following features " + "unsupported by this build: {}", + boost::join(unsupported, ", "))); + } + } + return meta; +} + void analyze_frozen(std::ostream& os, MappedFrozen const& meta, size_t total_size, int detail) { @@ -302,7 +317,8 @@ class metadata_ final : public metadata_v2::impl { std::span data, metadata_options const& options, int inode_offset, bool force_consistency_check) : data_(data) - , meta_(map_frozen(schema, data_)) + , meta_( + check_frozen(map_frozen(schema, data_))) , global_(lgr, &meta_, options.check_consistency || force_consistency_check) , root_(dir_entry_view::from_dir_entry_index(0, &global_)) diff --git a/thrift/metadata.thrift b/thrift/metadata.thrift index 0872bcbc1..0ff3b2bbc 100644 --- a/thrift/metadata.thrift +++ b/thrift/metadata.thrift @@ -350,6 +350,19 @@ struct metadata { 25: optional string_table compact_symlinks + //=========================================================// + // fields added with dwarfs-0.7.0, file system version 2.5 // + //=========================================================// + // preferred path separator of original file system 26: optional UInt32 preferred_path_separator + + //=========================================================// + // fields added with dwarfs-0.7.3, file system version 2.5 // + //=========================================================// + + // The set of features used in this file system image. As long + // as an older binary supports all features, it will be able + // to use images created with newer versions. + 27: optional set features } From 1ac36bb6fa9bdbd86f0d910b9f62d8ff2d1d288d Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Mon, 17 Jul 2023 11:23:10 +0200 Subject: [PATCH 003/571] Initial categorizer implementation --- CMakeLists.txt | 29 +- include/dwarfs/categorizer.h | 197 +++++++++++++ include/dwarfs/file_category.h | 72 +++++ include/dwarfs/inode.h | 2 + include/dwarfs/inode_manager.h | 11 + include/dwarfs/options.h | 9 +- src/dwarfs/categorizer.cpp | 268 ++++++++++++++++++ src/dwarfs/categorizer/binary_categorizer.cpp | 129 +++++++++ .../incompressible_categorizer.cpp | 232 +++++++++++++++ .../categorizer/libmagic_categorizer.cpp | 198 +++++++++++++ src/dwarfs/inode_manager.cpp | 51 +++- src/dwarfs/scanner.cpp | 9 + src/mkdwarfs_main.cpp | 19 +- vcpkg.json | 1 + 14 files changed, 1219 insertions(+), 8 deletions(-) create mode 100644 include/dwarfs/categorizer.h create mode 100644 include/dwarfs/file_category.h create mode 100644 src/dwarfs/categorizer.cpp create mode 100644 src/dwarfs/categorizer/binary_categorizer.cpp create mode 100644 src/dwarfs/categorizer/incompressible_categorizer.cpp create mode 100644 src/dwarfs/categorizer/libmagic_categorizer.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index dfab44268..db10fbc6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -212,6 +212,7 @@ if(PKG_CONFIG_FOUND) pkg_check_modules(LIBBROTLIDEC IMPORTED_TARGET libbrotlidec>=1.0.9) pkg_check_modules(LIBBROTLIENC IMPORTED_TARGET libbrotlienc>=1.0.9) pkg_check_modules(LIBARCHIVE IMPORTED_TARGET libarchive>=3.6.0) + pkg_check_modules(LIBMAGIC IMPORTED_TARGET libmagic>=5.38) pkg_check_modules(ZSTD IMPORTED_TARGET libzstd>=1.5.2) pkg_check_modules(XXHASH IMPORTED_TARGET libxxhash>=0.8.1) endif() @@ -356,6 +357,7 @@ list( src/dwarfs/block_range.cpp src/dwarfs/builtin_script.cpp src/dwarfs/cached_block.cpp + src/dwarfs/categorizer.cpp src/dwarfs/checksum.cpp src/dwarfs/chmod_transformer.cpp src/dwarfs/console_writer.cpp @@ -413,8 +415,20 @@ if(LIBBROTLIDEC_FOUND AND LIBBROTLIENC_FOUND) list(APPEND LIBDWARFS_COMPRESSION_SRC src/dwarfs/compression/brotli.cpp) endif() +list( + APPEND + LIBDWARFS_CATEGORIZER_SRC + src/dwarfs/categorizer/binary_categorizer.cpp + src/dwarfs/categorizer/incompressible_categorizer.cpp +) + +if(LIBMAGIC_FOUND) + list(APPEND LIBDWARFS_CATEGORIZER_SRC src/dwarfs/categorizer/libmagic_categorizer.cpp) +endif() + add_library(dwarfs ${LIBDWARFS_SRC}) add_library(dwarfs_compression ${LIBDWARFS_COMPRESSION_SRC}) +add_library(dwarfs_categorizer ${LIBDWARFS_CATEGORIZER_SRC}) add_library(dwarfs_tool src/dwarfs/tool.cpp) if(DWARFS_GIT_BUILD) @@ -425,6 +439,7 @@ target_compile_definitions( dwarfs_tool PRIVATE PRJ_BUILD_ID="${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_SYSTEM}, ${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}" ) +target_link_libraries(dwarfs_categorizer folly) target_link_libraries(dwarfs_compression folly) target_link_libraries(dwarfs_tool dwarfs) @@ -719,7 +734,8 @@ target_include_directories(metadata_thrift PRIVATE ${INCLUDE_DIRS}) target_link_libraries(metadata_thrift thrift_light) -foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS}) +foreach(tgt dwarfs dwarfs_compression dwarfs_categorizer + dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGETS}) target_include_directories( ${tgt} SYSTEM PRIVATE ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS} ${INCLUDE_DIRS} @@ -732,6 +748,7 @@ foreach(tgt dwarfs dwarfs_compression dwarfs_tool ${BINARY_TARGETS} ${MAIN_TARGE PRIVATE DWARFS_HAVE_LIBZSTD DWARFS_STATIC_BUILD=${STATIC_BUILD_DO_NOT_USE} $<$:DWARFS_USE_JEMALLOC> + $<$:DWARFS_HAVE_LIBMAGIC> $<$:DWARFS_HAVE_LIBLZ4> $<$:DWARFS_HAVE_LIBLZMA> $<$,$>:DWARFS_HAVE_LIBBROTLI> @@ -809,6 +826,10 @@ target_link_libraries( fsst ${Boost_LIBRARIES}) +if(LIBMAGIC_FOUND) + target_link_libraries(dwarfs PkgConfig::LIBMAGIC) +endif() + if(LIBLZ4_FOUND) target_link_libraries(dwarfs PkgConfig::LIBLZ4) endif() @@ -823,6 +844,7 @@ endif() if(NOT STATIC_BUILD_DO_NOT_USE) target_link_libraries(dwarfs PkgConfig::LIBARCHIVE) + target_link_libraries(dwarfs_categorizer PkgConfig::LIBMAGIC) endif(NOT STATIC_BUILD_DO_NOT_USE) if(ZSTD_FOUND AND PREFER_SYSTEM_ZSTD) @@ -850,6 +872,7 @@ foreach(tgt ${BINARY_TARGETS} ${MAIN_TARGETS}) endif() endforeach() +target_link_libraries(mkdwarfs_main "$") if(STATIC_BUILD_DO_NOT_USE) # ................................................................... @@ -883,6 +906,7 @@ if(STATIC_BUILD_DO_NOT_USE) import_static_lib(static_libssl "libssl.a") import_static_lib(static_libunwind "libunwind.a") import_static_lib(static_libarchive "libarchive.a") + import_static_lib(static_libmagic "libmagic.a") set_target_properties(static_libunwind PROPERTIES INTERFACE_LINK_LIBRARIES PkgConfig::LIBLZMA) @@ -890,7 +914,10 @@ if(STATIC_BUILD_DO_NOT_USE) static_libgflags) set_target_properties(static_librt PROPERTIES INTERFACE_LINK_LIBRARIES static_libgflags) + set_target_properties(static_libmagic PROPERTIES INTERFACE_LINK_LIBRARIES + static_libz) + target_link_libraries(dwarfs_categorizer static_libmagic) foreach(tgt ${BINARY_TARGETS}) if(PREFER_SYSTEM_LIBFMT) diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h new file mode 100644 index 000000000..a8ce4a7c2 --- /dev/null +++ b/include/dwarfs/categorizer.h @@ -0,0 +1,197 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dwarfs/file_category.h" + +namespace boost::program_options { +class options_description; +class variables_map; +} // namespace boost::program_options + +namespace dwarfs { + +class logger; + +class categorizer { + public: + virtual ~categorizer() = default; + + virtual std::span categories() const = 0; +}; + +class random_access_categorizer : public categorizer { + public: + virtual std::optional + categorize(std::filesystem::path const& path, + std::span data) const = 0; +}; + +class sequential_categorizer_job { + public: + virtual ~sequential_categorizer_job() = default; + + virtual void add(std::span data) = 0; + virtual std::optional result() = 0; +}; + +class sequential_categorizer : public categorizer { + public: + virtual std::unique_ptr + job(std::filesystem::path const& path, size_t total_size) const = 0; +}; + +class categorizer_job { + public: + class impl; + + categorizer_job(); + categorizer_job(std::unique_ptr impl); + + void categorize_random_access(std::span data) { + return impl_->categorize_random_access(data); + } + + void categorize_sequential(std::span data) { + return impl_->categorize_sequential(data); + } + + file_category result() { return impl_->result(); } + + explicit operator bool() const { return impl_ != nullptr; } + + class impl { + public: + virtual ~impl() = default; + + virtual void categorize_random_access(std::span data) = 0; + virtual void categorize_sequential(std::span data) = 0; + virtual file_category result() = 0; + }; + + private: + std::unique_ptr impl_; +}; + +class categorizer_manager { + public: + categorizer_manager(logger& lgr); + + void add(std::shared_ptr c) { impl_->add(std::move(c)); } + + categorizer_job job(std::filesystem::path const& path) const { + return impl_->job(path); + } + + std::string_view category_name(file_category c) const { + return impl_->category_name(c); + } + + class impl { + public: + virtual ~impl() = default; + + virtual void add(std::shared_ptr c) = 0; + virtual categorizer_job job(std::filesystem::path const& path) const = 0; + virtual std::string_view category_name(file_category c) const = 0; + }; + + private: + std::unique_ptr impl_; +}; + +class categorizer_info { + public: + virtual ~categorizer_info() = default; + + virtual std::string_view name() const = 0; + virtual std::shared_ptr + options() const = 0; +}; + +class categorizer_factory : public categorizer_info { + public: + virtual std::unique_ptr + create(logger& lgr, + boost::program_options::variables_map const& vm) const = 0; +}; + +namespace detail { + +template +class categorizer_factory_registrar { + public: + categorizer_factory_registrar(); +}; + +} // namespace detail + +class categorizer_registry { + public: + template + friend class detail::categorizer_factory_registrar; + + static categorizer_registry& instance(); + + std::unique_ptr + create(logger& lgr, std::string const& name, + boost::program_options::variables_map const& vm) const; + + void add_options(boost::program_options::options_description& opts) const; + + std::vector categorizer_names() const; + + private: + categorizer_registry(); + ~categorizer_registry(); + + void register_factory(std::unique_ptr&& factory); + + std::map> factories_; +}; + +namespace detail { + +template +categorizer_factory_registrar::categorizer_factory_registrar() { + ::dwarfs::categorizer_registry::instance().register_factory( + std::make_unique()); +} + +} // namespace detail + +#define REGISTER_CATEGORIZER_FACTORY(factory) \ + namespace { \ + ::dwarfs::detail::categorizer_factory_registrar \ + the_##factory##_registrar; \ + } + +} // namespace dwarfs diff --git a/include/dwarfs/file_category.h b/include/dwarfs/file_category.h new file mode 100644 index 000000000..6b56aa2e2 --- /dev/null +++ b/include/dwarfs/file_category.h @@ -0,0 +1,72 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include + +namespace dwarfs { + +class file_category { + public: + using value_type = uint32_t; + + static constexpr value_type const uninitialized{ + std::numeric_limits::max()}; + static constexpr value_type const min{0}; + static constexpr value_type const max{std::numeric_limits::max() - + 1}; + + file_category() + : value_{uninitialized} {} + file_category(value_type v) + : value_{v} {} + + file_category(file_category const&) = default; + file_category(file_category&&) = default; + + file_category& operator=(file_category const&) = default; + file_category& operator=(file_category&&) = default; + + file_category& operator=(value_type v) { + value_ = v; + return *this; + } + + value_type value() const { + if (empty()) { + throw std::range_error("file_category is uninitialized"); + } + return value_; + } + + void clear() { value_ = uninitialized; } + + bool empty() const { return value_ == uninitialized; } + + explicit operator bool() const { return !empty(); } + + private: + value_type value_; +}; + +} // namespace dwarfs diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index dbca37e53..a8d086a5f 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -27,6 +27,7 @@ #include +#include "dwarfs/file_category.h" #include "dwarfs/nilsimsa.h" #include "dwarfs/object.h" @@ -58,6 +59,7 @@ class inode : public object { virtual void add_chunk(size_t block, size_t offset, size_t size) = 0; virtual void append_chunks_to(std::vector& vec) const = 0; + virtual file_category category() const = 0; }; } // namespace dwarfs diff --git a/include/dwarfs/inode_manager.h b/include/dwarfs/inode_manager.h index 386198bc7..71f64cc9c 100644 --- a/include/dwarfs/inode_manager.h +++ b/include/dwarfs/inode_manager.h @@ -24,6 +24,11 @@ #include #include #include +#include +#include +#include + +#include "dwarfs/file_category.h" namespace dwarfs { @@ -54,6 +59,10 @@ class inode_manager { impl_->for_each_inode_in_order(fn); } + std::vector> category_counts() const { + return impl_->category_counts(); + } + class impl { public: virtual ~impl() = default; @@ -65,6 +74,8 @@ class inode_manager { file_order_options const& file_order, order_cb const& fn) = 0; virtual void for_each_inode_in_order( std::function const&)> const& fn) const = 0; + virtual std::vector> + category_counts() const = 0; }; private: diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index c1edabc13..b4afa6896 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "dwarfs/file_stat.h" @@ -32,6 +33,7 @@ namespace dwarfs { +class categorizer_manager; class entry; enum class mlock_mode { NONE, TRY, MUST }; @@ -78,11 +80,12 @@ struct inode_options { bool with_similarity{false}; bool with_nilsimsa{false}; std::optional max_similarity_scan_size; + std::shared_ptr categorizer_mgr; bool needs_scan(size_t size) const { - return (with_similarity || with_nilsimsa) && - (!max_similarity_scan_size || - size <= max_similarity_scan_size.value()); + return categorizer_mgr || ((with_similarity || with_nilsimsa) && + (!max_similarity_scan_size || + size <= max_similarity_scan_size.value())); } }; diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp new file mode 100644 index 000000000..969bbeb02 --- /dev/null +++ b/src/dwarfs/categorizer.cpp @@ -0,0 +1,268 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include + +#include + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/compiler.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace po = boost::program_options; + +namespace { +constexpr std::string_view const DEFAULT_CATEGORY{""}; +} + +class categorizer_manager_private { + public: + virtual ~categorizer_manager_private() = default; + + virtual std::vector> const& + categorizers() const = 0; + virtual file_category category(std::string_view cat) const = 0; +}; + +template +class categorizer_job_ final : public categorizer_job::impl { + public: + categorizer_job_(logger& lgr, categorizer_manager_private const& mgr, + std::filesystem::path const& path) + : LOG_PROXY_INIT(lgr) + , mgr_{mgr} + , path_{path} {} + + void categorize_random_access(std::span data) override; + void categorize_sequential(std::span data) override; + file_category result() override; + + private: + LOG_PROXY_DECL(LoggerPolicy); + categorizer_manager_private const& mgr_; + + std::string_view best_{DEFAULT_CATEGORY}; + int index_{-1}; + bool is_global_best_{false}; + size_t total_size_hint_{0}; + std::vector>> + seq_jobs_; + std::filesystem::path const path_; +}; + +template +void categorizer_job_::categorize_random_access( + std::span data) { + DWARFS_CHECK(index_ < 0, + "internal error: index already set in categorize_random_access"); + + total_size_hint_ = data.size(); + + bool global_best = true; + + for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) { + if (auto p = dynamic_cast(cat.get())) { + if (auto c = p->categorize(path_, data)) { + best_ = *c; + index_ = index; + is_global_best_ = global_best; + break; + } + } else { + global_best = false; + } + } +} + +template +void categorizer_job_::categorize_sequential( + std::span data) { + if (is_global_best_) { + return; + } + + if (seq_jobs_.empty()) [[unlikely]] { + for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) { + if (index_ >= 0 && static_cast(index) >= index_) { + break; + } + + if (auto p = dynamic_cast(cat.get())) { + if (auto job = p->job(path_, total_size_hint_)) { + seq_jobs_.emplace_back(index, std::move(job)); + } + } + } + } + + for (auto&& [index, job] : seq_jobs_) { + job->add(data); + } +} + +template +file_category categorizer_job_::result() { + if (!seq_jobs_.empty()) { + for (auto&& [index, job] : seq_jobs_) { + if (auto c = job->result()) { + assert(index_ < 0 || index < index_); + best_ = *c; + break; + } + } + + seq_jobs_.clear(); + } + + LOG_TRACE << path_ << " -> " << best_; + + return mgr_.category(best_); +} + +categorizer_job::categorizer_job() = default; + +categorizer_job::categorizer_job(std::unique_ptr impl) + : impl_{std::move(impl)} {} + +template +class categorizer_manager_ final : public categorizer_manager::impl, + public categorizer_manager_private { + public: + categorizer_manager_(logger& lgr) + : lgr_{lgr} + , LOG_PROXY_INIT(lgr) { + add_category(DEFAULT_CATEGORY); + } + + void add(std::shared_ptr c) override; + categorizer_job job(std::filesystem::path const& path) const override; + std::string_view category_name(file_category c) const override; + + std::vector> const& + categorizers() const override { + return categorizers_; + } + + file_category category(std::string_view cat) const override { + auto it = catmap_.find(cat); + DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat)); + return it->second; + } + + private: + void add_category(std::string_view cat) { + if (catmap_.emplace(cat, categories_.size()).second) { + categories_.emplace_back(cat); + } else { + LOG_WARN << "duplicate category: " << cat; + } + } + + logger& lgr_; + LOG_PROXY_DECL(LoggerPolicy); + std::vector> categorizers_; + std::vector categories_; + std::unordered_map catmap_; +}; + +template +void categorizer_manager_::add( + std::shared_ptr c) { + for (auto const& c : c->categories()) { + add_category(c); + } + + categorizers_.emplace_back(std::move(c)); +} + +template +categorizer_job categorizer_manager_::job( + std::filesystem::path const& path) const { + return categorizer_job( + make_unique_logging_object(lgr_, *this, path)); +} + +template +std::string_view +categorizer_manager_::category_name(file_category c) const { + return DWARFS_NOTHROW(categories_.at(c.value())); +} + +categorizer_manager::categorizer_manager(logger& lgr) + : impl_(make_unique_logging_object(lgr)) {} + +categorizer_registry& categorizer_registry::instance() { + static categorizer_registry the_instance; + return the_instance; +} + +void categorizer_registry::register_factory( + std::unique_ptr&& factory) { + auto name = factory->name(); + + if (!factories_.emplace(name, std::move(factory)).second) { + std::cerr << "categorizer factory name conflict (" << name << "\n"; + ::abort(); + } +} + +std::unique_ptr +categorizer_registry::create(logger& lgr, std::string const& name, + po::variables_map const& vm) const { + auto it = factories_.find(name); + + if (it == factories_.end()) { + DWARFS_THROW(runtime_error, "unknown categorizer: " + name); + } + + return it->second->create(lgr, vm); +} + +void categorizer_registry::add_options(po::options_description& opts) const { + for (auto& f : factories_) { + if (auto f_opts = f.second->options()) { + opts.add(*f_opts); + } + } +} + +std::vector categorizer_registry::categorizer_names() const { + std::vector rv; + for (auto& f : factories_) { + rv.emplace_back(f.first); + } + return rv; +} + +categorizer_registry::categorizer_registry() = default; +categorizer_registry::~categorizer_registry() = default; + +} // namespace dwarfs diff --git a/src/dwarfs/categorizer/binary_categorizer.cpp b/src/dwarfs/categorizer/binary_categorizer.cpp new file mode 100644 index 000000000..c7c2dc589 --- /dev/null +++ b/src/dwarfs/categorizer/binary_categorizer.cpp @@ -0,0 +1,129 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// TODO: this should obvs. work everywhere +#ifndef _WIN32 +#include +#endif + +#include "dwarfs/categorizer.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace po = boost::program_options; + +namespace { + +constexpr std::string_view const SOME_CATEGORY{"bla"}; + +class binary_categorizer_base : public random_access_categorizer { + public: + std::span categories() const override; +}; + +template +class binary_categorizer_ final : public binary_categorizer_base { + public: + binary_categorizer_(logger& lgr) + : LOG_PROXY_INIT(lgr) {} + + std::optional + categorize(std::filesystem::path const& path, + std::span data) const override; + + private: + LOG_PROXY_DECL(LoggerPolicy); +}; + +std::span binary_categorizer_base::categories() const { + static constexpr std::array const s_categories{ + SOME_CATEGORY, + }; + return s_categories; +} + +template +std::optional +binary_categorizer_::categorize(std::filesystem::path const&, + std::span data + [[maybe_unused]]) const { +#ifndef _WIN32 + auto p = data.data(); + if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) { + switch (p[EI_OSABI]) { + case ELFOSABI_SYSV: // 0 /* UNIX System V ABI */ + case ELFOSABI_HPUX: // 1 /* HP-UX */ + case ELFOSABI_NETBSD: // 2 /* NetBSD. */ + case ELFOSABI_GNU: // 3 /* Object uses GNU ELF extensions. */ + case ELFOSABI_SOLARIS: // 6 /* Sun Solaris. */ + case ELFOSABI_AIX: // 7 /* IBM AIX. */ + case ELFOSABI_IRIX: // 8 /* SGI Irix. */ + case ELFOSABI_FREEBSD: // 9 /* FreeBSD. */ + case ELFOSABI_TRU64: // 10 /* Compaq TRU64 UNIX. */ + case ELFOSABI_MODESTO: // 11 /* Novell Modesto. */ + case ELFOSABI_OPENBSD: // 12 /* OpenBSD. */ + case ELFOSABI_ARM_AEABI: // 64 /* ARM EABI */ + case ELFOSABI_ARM: // 97 /* ARM */ + case ELFOSABI_STANDALONE: // 255 /* Standalone (embedded) application */ + break; + } + } +#endif + + return std::nullopt; +} + +class binary_categorizer_factory : public categorizer_factory { + public: + std::string_view name() const override { return "binary"; } + + std::shared_ptr + options() const override { + return nullptr; + } + + std::unique_ptr + create(logger& lgr, po::variables_map const& /*vm*/) const override { + return make_unique_logging_object(lgr); + } + + private: +}; + +} // namespace + +REGISTER_CATEGORIZER_FACTORY(binary_categorizer_factory) + +} // namespace dwarfs diff --git a/src/dwarfs/categorizer/incompressible_categorizer.cpp b/src/dwarfs/categorizer/incompressible_categorizer.cpp new file mode 100644 index 000000000..665b4cf61 --- /dev/null +++ b/src/dwarfs/categorizer/incompressible_categorizer.cpp @@ -0,0 +1,232 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include +#include +#include + +#include + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace po = boost::program_options; + +namespace { + +constexpr std::string_view const INCOMPRESSIBLE_CATEGORY{"incompressible"}; + +struct incompressible_categorizer_config { + size_t min_input_size; + double max_ratio_size; + double max_ratio_blocks; + int lz4_acceleration; +}; + +template +class incompressible_categorizer_job_ : public sequential_categorizer_job { + public: + static constexpr size_t const block_size{1024 * 1024}; + + incompressible_categorizer_job_(logger& lgr, + incompressible_categorizer_config const& cfg, + std::filesystem::path const& path, + size_t total_size) + : LOG_PROXY_INIT(lgr) + , cfg_{cfg} + , path_{path} { + input_.reserve(total_size < block_size ? total_size : block_size); + state_ = ::malloc(LZ4_sizeofState()); + } + + ~incompressible_categorizer_job_() { ::free(state_); } + + void add(std::span data) override { + while (!data.empty()) { + auto part_size = input_.size() + data.size() <= block_size + ? data.size() + : block_size - input_.size(); + add_input(data.first(part_size)); + data = data.subspan(part_size); + } + } + + std::optional result() override { + if (!input_.empty()) { + compress(); + } + LOG_TRACE << path_ << " -> blocks: " << incompressible_blocks_ << "/" + << total_blocks_ << ", total compression ratio: " + << fmt::format("{:.2f}%", + 100.0 * total_output_size_ / total_input_size_); + if (total_blocks_ > 0 && + (total_output_size_ >= cfg_.max_ratio_size * total_input_size_ || + incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) { + return INCOMPRESSIBLE_CATEGORY; + } + return std::nullopt; + } + + private: + void add_input(std::span data) { + auto current_size = input_.size(); + assert(current_size + data.size() <= block_size); + input_.resize(current_size + data.size()); + ::memcpy(&input_[current_size], data.data(), data.size()); + if (input_.size() == block_size) { + compress(); + } + } + + void compress() { + total_input_size_ += input_.size(); + + output_.resize(::LZ4_compressBound(input_.size())); + + auto rv = ::LZ4_compress_fast_extState( + state_, reinterpret_cast(input_.data()), + reinterpret_cast(output_.data()), input_.size(), output_.size(), + cfg_.lz4_acceleration); + + if (rv == 0) { + DWARFS_THROW(runtime_error, + "unexpected error in LZ4_compress_fast_extState"); + } + + total_output_size_ += rv; + ++total_blocks_; + + if (rv >= static_cast(cfg_.max_ratio_size * input_.size())) { + ++incompressible_blocks_; + } + + input_.clear(); + } + + LOG_PROXY_DECL(LoggerPolicy); + void* state_; + std::vector input_; + std::vector output_; + size_t total_input_size_{0}; + size_t total_output_size_{0}; + size_t total_blocks_{0}; + size_t incompressible_blocks_{0}; + incompressible_categorizer_config const& cfg_; + std::filesystem::path const& path_; +}; + +class incompressible_categorizer_ final : public sequential_categorizer { + public: + incompressible_categorizer_(logger& lgr, + incompressible_categorizer_config const& cfg); + + std::span categories() const override; + std::unique_ptr + job(std::filesystem::path const& path, size_t total_size) const override; + + private: + logger& lgr_; + incompressible_categorizer_config const config_; +}; + +incompressible_categorizer_::incompressible_categorizer_( + logger& lgr, incompressible_categorizer_config const& cfg) + : lgr_{lgr} + , config_{cfg} {} + +std::span +incompressible_categorizer_::categories() const { + static constexpr std::array const s_categories{ + INCOMPRESSIBLE_CATEGORY, + }; + return s_categories; +} + +std::unique_ptr +incompressible_categorizer_::job(std::filesystem::path const& path, + size_t total_size) const { + if (total_size < config_.min_input_size) { + return nullptr; + } + + return make_unique_logging_object(lgr_, config_, path, + total_size); +} + +class incompressible_categorizer_factory : public categorizer_factory { + public: + incompressible_categorizer_factory() + : opts_{std::make_shared( + "Incompressible categorizer options")} { + static constexpr double const default_ratio{0.99}; + auto const default_ratio_str{fmt::format("{:.2f}", default_ratio)}; + // clang-format off + opts_->add_options() + ("incompressible-min-input-size", + po::value(&cfg_.min_input_size)->default_value(256), + "minimum file size in bytes to check for incompressibility") + ("incompressible-max-size-ratio", + po::value(&cfg_.max_ratio_size) + ->default_value(default_ratio, default_ratio_str), + "LZ4 compression ratio above files are considered incompressible") + ("incompressible-max-blocks-ratio", + po::value(&cfg_.max_ratio_blocks) + ->default_value(default_ratio, default_ratio_str), + "ratio of incompressible LZ4 blocks above which the whole file" + " is considered incompressible") + ("incompressible-lz4-acceleration (1..65537)", + po::value(&cfg_.lz4_acceleration)->default_value(1), + "LZ4 acceleration value") + ; + // clang-format on + } + + std::string_view name() const override { return "incompressible"; } + + std::shared_ptr options() const override { + return opts_; + } + + std::unique_ptr + create(logger& lgr, po::variables_map const& /*vm*/) const override { + return std::make_unique(lgr, cfg_); + } + + private: + incompressible_categorizer_config cfg_; + std::shared_ptr opts_; +}; + +} // namespace + +REGISTER_CATEGORIZER_FACTORY(incompressible_categorizer_factory) + +} // namespace dwarfs diff --git a/src/dwarfs/categorizer/libmagic_categorizer.cpp b/src/dwarfs/categorizer/libmagic_categorizer.cpp new file mode 100644 index 000000000..c26b961c6 --- /dev/null +++ b/src/dwarfs/categorizer/libmagic_categorizer.cpp @@ -0,0 +1,198 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace { + +namespace po = boost::program_options; + +constexpr std::string_view const SOME_CATEGORY{"bla"}; + +std::unordered_set executable_mime_types{ + "application/x-executable", + "application/x-sharedlib", +}; + +class magic_wrapper { + public: + magic_wrapper() = default; + + size_t cookie_count() const { + auto rlock = cookies_.rlock(); + return rlock->size(); + } + + std::string identify(std::span data) const { + std::string rv; + scoped_cookie m(*this); + if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) { + rv.assign(id); + } + if (rv.starts_with("application/")) { + ::magic_setflags(m.get(), MAGIC_NONE); + if (auto id = ::magic_buffer(m.get(), data.data(), data.size())) { + rv += "; " + std::string(id); + } + ::magic_setflags(m.get(), MAGIC_MIME_TYPE); + } + return rv; + } + + private: + using magic_cookie_t = + std::unique_ptr; + + magic_cookie_t new_cookie() const { + magic_cookie_t m(::magic_open(MAGIC_MIME_TYPE), &::magic_close); + if (!m) { + throw std::runtime_error("could not create magic cookie"); + } + if (::magic_load(m.get(), NULL) != 0) { + throw std::runtime_error( + fmt::format("(magic) {}", ::magic_error(m.get()))); + } + return m; + } + + class scoped_cookie { + public: + scoped_cookie(magic_wrapper const& w) + : cookie_{get_scoped_cookie(w)} + , w_{w} {} + + ~scoped_cookie() { + auto wlock = w_.cookies_.wlock(); + wlock->push(std::move(cookie_)); + } + + ::magic_t get() const { return cookie_.get(); } + + private: + static magic_cookie_t get_scoped_cookie(magic_wrapper const& w) { + auto wlock = w.cookies_.wlock(); + if (wlock->empty()) [[unlikely]] { + return w.new_cookie(); + } + auto cookie = std::move(wlock->top()); + wlock->pop(); + return cookie; + } + + magic_cookie_t cookie_; + magic_wrapper const& w_; + }; + + mutable folly::Synchronized> cookies_; +}; + +class libmagic_categorizer_base : public random_access_categorizer { + public: + std::span categories() const override; +}; + +template +class libmagic_categorizer_ final : public libmagic_categorizer_base { + public: + explicit libmagic_categorizer_(logger& lgr) + : LOG_PROXY_INIT(lgr) {} + + ~libmagic_categorizer_() { + LOG_INFO << m_.cookie_count() << " magic cookies were used"; + { + auto rlock = mimetypes_.rlock(); + for (auto const& [k, v] : *rlock) { + LOG_INFO << k << " -> " << v; + } + } + } + + std::optional + categorize(std::filesystem::path const& path, + std::span data) const override; + + private: + LOG_PROXY_DECL(LoggerPolicy); + magic_wrapper m_; + mutable folly::Synchronized> mimetypes_; +}; + +std::span +libmagic_categorizer_base::categories() const { + static constexpr std::array const s_categories{ + SOME_CATEGORY, + }; + return s_categories; +} + +template +std::optional libmagic_categorizer_::categorize( + std::filesystem::path const& path, std::span data) const { + auto id = m_.identify(data); + LOG_DEBUG << path << " -> (magic) " << id; + { + auto wlock = mimetypes_.wlock(); + ++(*wlock)[id]; + } + return std::nullopt; +} + +class libmagic_categorizer_factory : public categorizer_factory { + public: + std::string_view name() const override { return "libmagic"; } + + std::shared_ptr + options() const override { + return nullptr; + } + + std::unique_ptr + create(logger& lgr, po::variables_map const& /*vm*/) const override { + return make_unique_logging_object(lgr); + } + + private: +}; + +} // namespace + +REGISTER_CATEGORIZER_FACTORY(libmagic_categorizer_factory) + +} // namespace dwarfs diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index cfdd50248..3d05bce0e 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -29,10 +29,12 @@ #include #include #include +#include #include #include +#include "dwarfs/categorizer.h" #include "dwarfs/compiler.h" #include "dwarfs/entry.h" #include "dwarfs/error.h" @@ -144,8 +146,19 @@ class inode_ : public inode { similarity sc; nilsimsa nc; + categorizer_job catjob; + + if (opts.categorizer_mgr) { + catjob = + opts.categorizer_mgr->job(mm ? mm->path().string() : ""); + } + if (mm) { - auto update_hashes = [&](uint8_t const* data, size_t size) { + if (catjob) { + catjob.categorize_random_access(mm->span()); + } + + auto scan_sequential = [&](uint8_t const* data, size_t size) { if (opts.with_similarity) { sc.update(data, size); } @@ -153,6 +166,10 @@ class inode_ : public inode { if (opts.with_nilsimsa) { nc.update(data, size); } + + if (catjob) { + catjob.categorize_sequential(std::span(data, size)); + } }; constexpr size_t chunk_size = 32 << 20; @@ -160,13 +177,13 @@ class inode_ : public inode { size_t size = mm->size(); while (size >= chunk_size) { - update_hashes(mm->as(offset), chunk_size); + scan_sequential(mm->as(offset), chunk_size); mm->release_until(offset); offset += chunk_size; size -= chunk_size; } - update_hashes(mm->as(offset), size); + scan_sequential(mm->as(offset), size); } if (opts.with_similarity) { @@ -182,6 +199,10 @@ class inode_ : public inode { nilsimsa_valid_ = true; #endif } + + if (catjob) { + category_ = catjob.result(); + } } void add_chunk(size_t block, size_t offset, size_t size) override { @@ -207,9 +228,12 @@ class inode_ : public inode { vec.insert(vec.end(), chunks_.begin(), chunks_.end()); } + file_category category() const override { return category_; } + private: std::optional num_; uint32_t similarity_hash_{0}; + file_category category_; files_vector files_; std::vector chunks_; nilsimsa::hash_type nilsimsa_similarity_hash_; @@ -254,6 +278,27 @@ class inode_manager_ final : public inode_manager::impl { } } + std::vector> + category_counts() const override { + std::unordered_map tmp; + + for (auto const& i : inodes_) { + ++tmp[i->category().value()]; + } + + std::vector> rv; + + for (auto const& [k, v] : tmp) { + rv.emplace_back(k, v); + } + + std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) { + return a.first.value() < b.first.value(); + }); + + return rv; + } + private: void order_inodes_by_path() { std::vector paths; diff --git a/src/dwarfs/scanner.cpp b/src/dwarfs/scanner.cpp index c3e2b96d6..9c3fcab77 100644 --- a/src/dwarfs/scanner.cpp +++ b/src/dwarfs/scanner.cpp @@ -37,6 +37,7 @@ #include #include "dwarfs/block_data.h" +#include "dwarfs/categorizer.h" #include "dwarfs/entry.h" #include "dwarfs/error.h" #include "dwarfs/file_scanner.h" @@ -606,6 +607,14 @@ void scanner_::scan( << prog.duplicate_files << "/" << prog.files_found << " duplicate files"; + if (options_.inode.categorizer_mgr) { + for (auto const& cc : im.category_counts()) { + LOG_INFO << cc.second << " " + << options_.inode.categorizer_mgr->category_name(cc.first) + << " files"; + } + } + global_entry_data ge_data(options_); thrift::metadata::metadata mv2; diff --git a/src/mkdwarfs_main.cpp b/src/mkdwarfs_main.cpp index 4b0f83640..e873e73bf 100644 --- a/src/mkdwarfs_main.cpp +++ b/src/mkdwarfs_main.cpp @@ -53,6 +53,7 @@ #include "dwarfs/block_compressor.h" #include "dwarfs/block_manager.h" #include "dwarfs/builtin_script.h" +#include "dwarfs/categorizer.h" #include "dwarfs/chmod_transformer.h" #include "dwarfs/console_writer.h" #include "dwarfs/entry.h" @@ -318,7 +319,7 @@ int mkdwarfs_main(int argc, sys_char** argv) { std::vector filter; size_t num_workers, num_scanner_workers; bool no_progress = false, remove_header = false, no_section_index = false, - force_overwrite = false; + force_overwrite = false, enable_categorizer = false; unsigned level; int compress_niceness; uint16_t uid, gid; @@ -391,6 +392,9 @@ int mkdwarfs_main(int argc, sys_char** argv) { ("recompress", po::value(&recompress_opts)->implicit_value("all"), "recompress an existing filesystem (none, block, metadata, all)") + ("categorize", + po::value(&enable_categorizer)->zero_tokens(), + "WIP enable categorizer") ("order", po::value(&order), order_desc.c_str()) @@ -510,6 +514,9 @@ int mkdwarfs_main(int argc, sys_char** argv) { .add(filesystem_opts) .add(metadata_opts); + auto& catreg = categorizer_registry::instance(); + catreg.add_options(opts); + po::variables_map vm; auto& sys_err_out = SYS_CERR; @@ -1021,6 +1028,14 @@ int mkdwarfs_main(int argc, sys_char** argv) { options.file_order.mode == file_order_mode::SIMILARITY; options.inode.with_nilsimsa = options.file_order.mode == file_order_mode::NILSIMSA; + if (enable_categorizer) { + options.inode.categorizer_mgr = + std::make_shared(lgr); + // TODO + for (auto const& name : catreg.categorizer_names()) { + options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm)); + } + } scanner s(lgr, wg_scanner, cfg, entry_factory::create(), std::make_shared(), std::move(script), @@ -1031,6 +1046,8 @@ int mkdwarfs_main(int argc, sys_char** argv) { } else { s.scan(fsw, path, prog); } + + options.inode.categorizer_mgr.reset(); } } catch (runtime_error const& e) { LOG_ERROR << e.what(); diff --git a/vcpkg.json b/vcpkg.json index b02702746..fca846b01 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -17,6 +17,7 @@ "glog", "libarchive", "libevent", + "libmagic", "openssl", "pkgconf", "utfcpp", From 33b2dfb95f5627cf398179e6e39f32baa40e7ee5 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 12:29:13 +0200 Subject: [PATCH 004/571] Switch categorizers to returning inode fragments --- CMakeLists.txt | 1 + include/dwarfs/categorizer.h | 34 +++++--- .../{file_category.h => fragment_category.h} | 52 ++++++++---- include/dwarfs/inode.h | 4 +- include/dwarfs/inode_fragments.h | 83 +++++++++++++++++++ include/dwarfs/inode_manager.h | 7 +- src/dwarfs/categorizer.cpp | 61 ++++++++------ src/dwarfs/categorizer/binary_categorizer.cpp | 18 ++-- .../incompressible_categorizer.cpp | 24 ++++-- .../categorizer/libmagic_categorizer.cpp | 14 ++-- src/dwarfs/inode_fragments.cpp | 74 +++++++++++++++++ src/dwarfs/inode_manager.cpp | 26 +++--- 12 files changed, 312 insertions(+), 86 deletions(-) rename include/dwarfs/{file_category.h => fragment_category.h} (56%) create mode 100644 include/dwarfs/inode_fragments.h create mode 100644 src/dwarfs/inode_fragments.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index db10fbc6f..2e4567d76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -373,6 +373,7 @@ list( src/dwarfs/fstypes.cpp src/dwarfs/fs_section.cpp src/dwarfs/global_entry_data.cpp + src/dwarfs/inode_fragments.cpp src/dwarfs/inode_manager.cpp src/dwarfs/inode_reader_v2.cpp src/dwarfs/logger.cpp diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h index a8ce4a7c2..e34405580 100644 --- a/include/dwarfs/categorizer.h +++ b/include/dwarfs/categorizer.h @@ -23,14 +23,14 @@ #include #include +#include #include #include #include -#include #include #include -#include "dwarfs/file_category.h" +#include "dwarfs/inode_fragments.h" namespace boost::program_options { class options_description; @@ -41,6 +41,9 @@ namespace dwarfs { class logger; +using category_mapper = + std::function; + class categorizer { public: virtual ~categorizer() = default; @@ -50,23 +53,26 @@ class categorizer { class random_access_categorizer : public categorizer { public: - virtual std::optional - categorize(std::filesystem::path const& path, - std::span data) const = 0; + virtual inode_fragments + categorize(std::filesystem::path const& path, std::span data, + category_mapper const& mapper) const = 0; }; +// TODO: add call to check if categorizer can return multiple fragments +// if it *can* we must run it before we start similarity hashing class sequential_categorizer_job { public: virtual ~sequential_categorizer_job() = default; virtual void add(std::span data) = 0; - virtual std::optional result() = 0; + virtual inode_fragments result() = 0; }; class sequential_categorizer : public categorizer { public: virtual std::unique_ptr - job(std::filesystem::path const& path, size_t total_size) const = 0; + job(std::filesystem::path const& path, size_t total_size, + category_mapper const& mapper) const = 0; }; class categorizer_job { @@ -76,6 +82,10 @@ class categorizer_job { categorizer_job(); categorizer_job(std::unique_ptr impl); + void set_total_size(size_t total_size) { + return impl_->set_total_size(total_size); + } + void categorize_random_access(std::span data) { return impl_->categorize_random_access(data); } @@ -84,7 +94,7 @@ class categorizer_job { return impl_->categorize_sequential(data); } - file_category result() { return impl_->result(); } + inode_fragments result() { return impl_->result(); } explicit operator bool() const { return impl_ != nullptr; } @@ -92,9 +102,10 @@ class categorizer_job { public: virtual ~impl() = default; + virtual void set_total_size(size_t total_size) = 0; virtual void categorize_random_access(std::span data) = 0; virtual void categorize_sequential(std::span data) = 0; - virtual file_category result() = 0; + virtual inode_fragments result() = 0; }; private: @@ -111,7 +122,7 @@ class categorizer_manager { return impl_->job(path); } - std::string_view category_name(file_category c) const { + std::string_view category_name(fragment_category::value_type c) const { return impl_->category_name(c); } @@ -121,7 +132,8 @@ class categorizer_manager { virtual void add(std::shared_ptr c) = 0; virtual categorizer_job job(std::filesystem::path const& path) const = 0; - virtual std::string_view category_name(file_category c) const = 0; + virtual std::string_view + category_name(fragment_category::value_type c) const = 0; }; private: diff --git a/include/dwarfs/file_category.h b/include/dwarfs/fragment_category.h similarity index 56% rename from include/dwarfs/file_category.h rename to include/dwarfs/fragment_category.h index 6b56aa2e2..4bc307ee2 100644 --- a/include/dwarfs/file_category.h +++ b/include/dwarfs/fragment_category.h @@ -21,12 +21,13 @@ #pragma once +#include #include #include namespace dwarfs { -class file_category { +class fragment_category { public: using value_type = uint32_t; @@ -36,37 +37,60 @@ class file_category { static constexpr value_type const max{std::numeric_limits::max() - 1}; - file_category() - : value_{uninitialized} {} - file_category(value_type v) + fragment_category() = default; + + explicit fragment_category(value_type v) : value_{v} {} - file_category(file_category const&) = default; - file_category(file_category&&) = default; + fragment_category(value_type v, value_type subcategory) + : value_{v} + , subcategory_{subcategory} {} + + fragment_category(fragment_category const&) = default; + fragment_category(fragment_category&&) = default; - file_category& operator=(file_category const&) = default; - file_category& operator=(file_category&&) = default; + fragment_category& operator=(fragment_category const&) = default; + fragment_category& operator=(fragment_category&&) = default; - file_category& operator=(value_type v) { + fragment_category& operator=(value_type v) { + assert(v != uninitialized); value_ = v; return *this; } value_type value() const { - if (empty()) { - throw std::range_error("file_category is uninitialized"); - } + assert(!empty()); return value_; } - void clear() { value_ = uninitialized; } + void clear() { + value_ = uninitialized; + subcategory_ = uninitialized; + } bool empty() const { return value_ == uninitialized; } explicit operator bool() const { return !empty(); } + void set_subcategory(value_type subcategory) { + assert(!empty()); + assert(subcategory != uninitialized); + subcategory_ = subcategory; + } + + bool has_subcategory() const { + return !empty() && subcategory_ != uninitialized; + } + + value_type subcategory() const { + assert(!empty()); + assert(subcategory_ != uninitialized); + return subcategory_; + } + private: - value_type value_; + value_type value_{uninitialized}; + value_type subcategory_{uninitialized}; }; } // namespace dwarfs diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index a8d086a5f..703e113cc 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -27,7 +27,7 @@ #include -#include "dwarfs/file_category.h" +#include "dwarfs/inode_fragments.h" #include "dwarfs/nilsimsa.h" #include "dwarfs/object.h" @@ -59,7 +59,7 @@ class inode : public object { virtual void add_chunk(size_t block, size_t offset, size_t size) = 0; virtual void append_chunks_to(std::vector& vec) const = 0; - virtual file_category category() const = 0; + virtual inode_fragments const& fragments() const = 0; }; } // namespace dwarfs diff --git a/include/dwarfs/inode_fragments.h b/include/dwarfs/inode_fragments.h new file mode 100644 index 000000000..319132271 --- /dev/null +++ b/include/dwarfs/inode_fragments.h @@ -0,0 +1,83 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include +#include +#include + +#include + +#include "dwarfs/fragment_category.h" +#include "dwarfs/types.h" + +namespace dwarfs { + +class single_inode_fragment { + public: + single_inode_fragment(fragment_category category, file_off_t length) + : category_{category} + , length_{length} {} + + fragment_category category() const { return category_; } + file_off_t length() const { return length_; } + file_off_t size() const { return length_; } + + private: + fragment_category category_; + file_off_t length_; +}; + +class inode_fragments { + public: + using mapper_function_type = + std::function; + + inode_fragments() = default; + + single_inode_fragment& + emplace_back(fragment_category category, file_off_t length) { + return fragments_.emplace_back(category, length); + } + + std::span span() const { return fragments_; } + + bool empty() const { return fragments_.empty(); } + + explicit operator bool() const { return !empty(); } + + std::ostream& + to_stream(std::ostream& os, + mapper_function_type const& mapper = mapper_function_type()) const; + std::string + to_string(mapper_function_type const& mapper = mapper_function_type()) const; + + private: + folly::small_vector fragments_; +}; + +inline std::ostream& operator<<(std::ostream& os, inode_fragments const& frag) { + return frag.to_stream(os); +} + +} // namespace dwarfs diff --git a/include/dwarfs/inode_manager.h b/include/dwarfs/inode_manager.h index 71f64cc9c..1678fb53c 100644 --- a/include/dwarfs/inode_manager.h +++ b/include/dwarfs/inode_manager.h @@ -28,7 +28,7 @@ #include #include -#include "dwarfs/file_category.h" +#include "dwarfs/fragment_category.h" namespace dwarfs { @@ -59,7 +59,8 @@ class inode_manager { impl_->for_each_inode_in_order(fn); } - std::vector> category_counts() const { + std::vector> + category_counts() const { return impl_->category_counts(); } @@ -74,7 +75,7 @@ class inode_manager { file_order_options const& file_order, order_cb const& fn) = 0; virtual void for_each_inode_in_order( std::function const&)> const& fn) const = 0; - virtual std::vector> + virtual std::vector> category_counts() const = 0; }; diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp index 969bbeb02..067100c07 100644 --- a/src/dwarfs/categorizer.cpp +++ b/src/dwarfs/categorizer.cpp @@ -35,19 +35,20 @@ namespace dwarfs { +using namespace std::placeholders; + namespace po = boost::program_options; namespace { constexpr std::string_view const DEFAULT_CATEGORY{""}; } -class categorizer_manager_private { +class categorizer_manager_private : public categorizer_manager::impl { public: - virtual ~categorizer_manager_private() = default; - virtual std::vector> const& categorizers() const = 0; - virtual file_category category(std::string_view cat) const = 0; + virtual fragment_category::value_type + category(std::string_view cat) const = 0; }; template @@ -57,39 +58,48 @@ class categorizer_job_ final : public categorizer_job::impl { std::filesystem::path const& path) : LOG_PROXY_INIT(lgr) , mgr_{mgr} - , path_{path} {} + , path_{path} + , cat_mapper_{std::bind(&categorizer_manager_private::category, + std::cref(mgr_), _1)} {} + void set_total_size(size_t total_size) override; void categorize_random_access(std::span data) override; void categorize_sequential(std::span data) override; - file_category result() override; + inode_fragments result() override; private: LOG_PROXY_DECL(LoggerPolicy); categorizer_manager_private const& mgr_; - std::string_view best_{DEFAULT_CATEGORY}; + inode_fragments best_; int index_{-1}; bool is_global_best_{false}; - size_t total_size_hint_{0}; + size_t total_size_{0}; std::vector>> seq_jobs_; std::filesystem::path const path_; + category_mapper cat_mapper_; }; +template +void categorizer_job_::set_total_size(size_t total_size) { + total_size_ = total_size; +} + template void categorizer_job_::categorize_random_access( std::span data) { DWARFS_CHECK(index_ < 0, "internal error: index already set in categorize_random_access"); - total_size_hint_ = data.size(); + total_size_ = data.size(); bool global_best = true; for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) { if (auto p = dynamic_cast(cat.get())) { - if (auto c = p->categorize(path_, data)) { - best_ = *c; + if (auto c = p->categorize(path_, data, cat_mapper_)) { + best_ = c; index_ = index; is_global_best_ = global_best; break; @@ -114,7 +124,7 @@ void categorizer_job_::categorize_sequential( } if (auto p = dynamic_cast(cat.get())) { - if (auto job = p->job(path_, total_size_hint_)) { + if (auto job = p->job(path_, total_size_, cat_mapper_)) { seq_jobs_.emplace_back(index, std::move(job)); } } @@ -127,12 +137,12 @@ void categorizer_job_::categorize_sequential( } template -file_category categorizer_job_::result() { +inode_fragments categorizer_job_::result() { if (!seq_jobs_.empty()) { for (auto&& [index, job] : seq_jobs_) { if (auto c = job->result()) { assert(index_ < 0 || index < index_); - best_ = *c; + best_ = c; break; } } @@ -140,9 +150,12 @@ file_category categorizer_job_::result() { seq_jobs_.clear(); } - LOG_TRACE << path_ << " -> " << best_; + LOG_TRACE << path_ << " -> " + << best_.to_string([this](fragment_category::value_type c) { + return std::string(mgr_.category_name(c)); + }); - return mgr_.category(best_); + return best_; } categorizer_job::categorizer_job() = default; @@ -151,8 +164,7 @@ categorizer_job::categorizer_job(std::unique_ptr impl) : impl_{std::move(impl)} {} template -class categorizer_manager_ final : public categorizer_manager::impl, - public categorizer_manager_private { +class categorizer_manager_ final : public categorizer_manager_private { public: categorizer_manager_(logger& lgr) : lgr_{lgr} @@ -162,14 +174,15 @@ class categorizer_manager_ final : public categorizer_manager::impl, void add(std::shared_ptr c) override; categorizer_job job(std::filesystem::path const& path) const override; - std::string_view category_name(file_category c) const override; + std::string_view + category_name(fragment_category::value_type c) const override; std::vector> const& categorizers() const override { return categorizers_; } - file_category category(std::string_view cat) const override { + fragment_category::value_type category(std::string_view cat) const override { auto it = catmap_.find(cat); DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat)); return it->second; @@ -188,7 +201,7 @@ class categorizer_manager_ final : public categorizer_manager::impl, LOG_PROXY_DECL(LoggerPolicy); std::vector> categorizers_; std::vector categories_; - std::unordered_map catmap_; + std::unordered_map catmap_; }; template @@ -210,9 +223,9 @@ categorizer_job categorizer_manager_::job( } template -std::string_view -categorizer_manager_::category_name(file_category c) const { - return DWARFS_NOTHROW(categories_.at(c.value())); +std::string_view categorizer_manager_::category_name( + fragment_category::value_type c) const { + return DWARFS_NOTHROW(categories_.at(c)); } categorizer_manager::categorizer_manager(logger& lgr) diff --git a/src/dwarfs/categorizer/binary_categorizer.cpp b/src/dwarfs/categorizer/binary_categorizer.cpp index c7c2dc589..075cb85af 100644 --- a/src/dwarfs/categorizer/binary_categorizer.cpp +++ b/src/dwarfs/categorizer/binary_categorizer.cpp @@ -58,9 +58,9 @@ class binary_categorizer_ final : public binary_categorizer_base { binary_categorizer_(logger& lgr) : LOG_PROXY_INIT(lgr) {} - std::optional - categorize(std::filesystem::path const& path, - std::span data) const override; + inode_fragments + categorize(std::filesystem::path const& path, std::span data, + category_mapper const& mapper) const override; private: LOG_PROXY_DECL(LoggerPolicy); @@ -74,10 +74,12 @@ std::span binary_categorizer_base::categories() const { } template -std::optional -binary_categorizer_::categorize(std::filesystem::path const&, - std::span data - [[maybe_unused]]) const { +inode_fragments binary_categorizer_::categorize( + std::filesystem::path const&, + std::span data [[maybe_unused]], + category_mapper const& /*mapper*/) const { + inode_fragments fragments; + #ifndef _WIN32 auto p = data.data(); if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) { @@ -101,7 +103,7 @@ binary_categorizer_::categorize(std::filesystem::path const&, } #endif - return std::nullopt; + return fragments; } class binary_categorizer_factory : public categorizer_factory { diff --git a/src/dwarfs/categorizer/incompressible_categorizer.cpp b/src/dwarfs/categorizer/incompressible_categorizer.cpp index 665b4cf61..976b2b054 100644 --- a/src/dwarfs/categorizer/incompressible_categorizer.cpp +++ b/src/dwarfs/categorizer/incompressible_categorizer.cpp @@ -57,10 +57,12 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { incompressible_categorizer_job_(logger& lgr, incompressible_categorizer_config const& cfg, std::filesystem::path const& path, - size_t total_size) + size_t total_size, + category_mapper const& mapper) : LOG_PROXY_INIT(lgr) , cfg_{cfg} - , path_{path} { + , path_{path} + , mapper_{mapper} { input_.reserve(total_size < block_size ? total_size : block_size); state_ = ::malloc(LZ4_sizeofState()); } @@ -77,7 +79,8 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { } } - std::optional result() override { + inode_fragments result() override { + inode_fragments fragments; if (!input_.empty()) { compress(); } @@ -88,9 +91,11 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { if (total_blocks_ > 0 && (total_output_size_ >= cfg_.max_ratio_size * total_input_size_ || incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) { - return INCOMPRESSIBLE_CATEGORY; + fragments.emplace_back( + fragment_category(mapper_(INCOMPRESSIBLE_CATEGORY)), + total_input_size_); } - return std::nullopt; + return fragments; } private: @@ -139,6 +144,7 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job { size_t incompressible_blocks_{0}; incompressible_categorizer_config const& cfg_; std::filesystem::path const& path_; + category_mapper const& mapper_; }; class incompressible_categorizer_ final : public sequential_categorizer { @@ -148,7 +154,8 @@ class incompressible_categorizer_ final : public sequential_categorizer { std::span categories() const override; std::unique_ptr - job(std::filesystem::path const& path, size_t total_size) const override; + job(std::filesystem::path const& path, size_t total_size, + category_mapper const& mapper) const override; private: logger& lgr_; @@ -170,7 +177,8 @@ incompressible_categorizer_::categories() const { std::unique_ptr incompressible_categorizer_::job(std::filesystem::path const& path, - size_t total_size) const { + size_t total_size, + category_mapper const& mapper) const { if (total_size < config_.min_input_size) { return nullptr; } @@ -178,7 +186,7 @@ incompressible_categorizer_::job(std::filesystem::path const& path, return make_unique_logging_object(lgr_, config_, path, - total_size); + total_size, mapper); } class incompressible_categorizer_factory : public categorizer_factory { diff --git a/src/dwarfs/categorizer/libmagic_categorizer.cpp b/src/dwarfs/categorizer/libmagic_categorizer.cpp index c26b961c6..47324906a 100644 --- a/src/dwarfs/categorizer/libmagic_categorizer.cpp +++ b/src/dwarfs/categorizer/libmagic_categorizer.cpp @@ -143,9 +143,9 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base { } } - std::optional - categorize(std::filesystem::path const& path, - std::span data) const override; + inode_fragments + categorize(std::filesystem::path const& path, std::span data, + category_mapper const& mapper) const override; private: LOG_PROXY_DECL(LoggerPolicy); @@ -162,15 +162,17 @@ libmagic_categorizer_base::categories() const { } template -std::optional libmagic_categorizer_::categorize( - std::filesystem::path const& path, std::span data) const { +inode_fragments libmagic_categorizer_::categorize( + std::filesystem::path const& path, std::span data, + category_mapper const& /*mapper*/) const { + inode_fragments fragments; // TODO: actually fill this :-) auto id = m_.identify(data); LOG_DEBUG << path << " -> (magic) " << id; { auto wlock = mimetypes_.wlock(); ++(*wlock)[id]; } - return std::nullopt; + return fragments; } class libmagic_categorizer_factory : public categorizer_factory { diff --git a/src/dwarfs/inode_fragments.cpp b/src/dwarfs/inode_fragments.cpp new file mode 100644 index 000000000..a09721a64 --- /dev/null +++ b/src/dwarfs/inode_fragments.cpp @@ -0,0 +1,74 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include + +#include "dwarfs/inode_fragments.h" + +namespace dwarfs { + +std::ostream& +inode_fragments::to_stream(std::ostream& os, + mapper_function_type const& mapper) const { + if (empty()) { + os << "(empty)"; + } else { + os << "["; + bool first = true; + + for (auto const& f : span()) { + if (first) { + first = false; + } else { + os << ", "; + } + + os << "("; + + auto const& cat = f.category(); + if (mapper) { + os << mapper(cat.value()); + } else { + os << cat.value(); + } + + if (cat.has_subcategory()) { + os << "/" << cat.subcategory(); + } + + os << ", " << f.size() << ")"; + } + + os << "]"; + } + + return os; +} + +std::string +inode_fragments::to_string(mapper_function_type const& mapper) const { + std::ostringstream oss; + to_stream(oss, mapper); + return oss.str(); +} + +} // namespace dwarfs diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 3d05bce0e..44eb7e2d9 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -155,6 +155,7 @@ class inode_ : public inode { if (mm) { if (catjob) { + catjob.set_total_size(mm->size()); catjob.categorize_random_access(mm->span()); } @@ -201,7 +202,7 @@ class inode_ : public inode { } if (catjob) { - category_ = catjob.result(); + fragments_ = catjob.result(); } } @@ -228,12 +229,14 @@ class inode_ : public inode { vec.insert(vec.end(), chunks_.begin(), chunks_.end()); } - file_category category() const override { return category_; } + inode_fragments const& fragments() const override { return fragments_; } private: + // TODO: can we move optional stuff (e.g. nilsimsa_similarity_hash_) out of + // here? std::optional num_; uint32_t similarity_hash_{0}; - file_category category_; + inode_fragments fragments_; files_vector files_; std::vector chunks_; nilsimsa::hash_type nilsimsa_similarity_hash_; @@ -278,23 +281,26 @@ class inode_manager_ final : public inode_manager::impl { } } - std::vector> + std::vector> category_counts() const override { - std::unordered_map tmp; + std::unordered_map tmp; for (auto const& i : inodes_) { - ++tmp[i->category().value()]; + if (auto const& fragments = i->fragments(); !fragments.empty()) { + for (auto const& frag : fragments.span()) { + ++tmp[frag.category().value()]; + } + } } - std::vector> rv; + std::vector> rv; for (auto const& [k, v] : tmp) { rv.emplace_back(k, v); } - std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) { - return a.first.value() < b.first.value(); - }); + std::sort(rv.begin(), rv.end(), + [](auto const& a, auto const& b) { return a.first < b.first; }); return rv; } From a9bf89aa0c3c002f53ae7a277d55b9ca151db927 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 13:54:53 +0200 Subject: [PATCH 005/571] Add safety check for multi-fragment sequential categorizers --- include/dwarfs/categorizer.h | 6 ++++++ src/dwarfs/categorizer.cpp | 18 ++++++++++++++++++ src/dwarfs/categorizer/binary_categorizer.cpp | 2 ++ .../categorizer/incompressible_categorizer.cpp | 1 + .../categorizer/libmagic_categorizer.cpp | 2 ++ src/dwarfs/inode_manager.cpp | 4 ++++ 6 files changed, 33 insertions(+) diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h index e34405580..d7d3a0cc5 100644 --- a/include/dwarfs/categorizer.h +++ b/include/dwarfs/categorizer.h @@ -49,6 +49,7 @@ class categorizer { virtual ~categorizer() = default; virtual std::span categories() const = 0; + virtual bool is_single_fragment() const = 0; }; class random_access_categorizer : public categorizer { @@ -98,6 +99,10 @@ class categorizer_job { explicit operator bool() const { return impl_ != nullptr; } + bool has_multi_fragment_sequential_categorizers() const { + return impl_->has_multi_fragment_sequential_categorizers(); + } + class impl { public: virtual ~impl() = default; @@ -106,6 +111,7 @@ class categorizer_job { virtual void categorize_random_access(std::span data) = 0; virtual void categorize_sequential(std::span data) = 0; virtual inode_fragments result() = 0; + virtual bool has_multi_fragment_sequential_categorizers() const = 0; }; private: diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp index 067100c07..00dcc50c9 100644 --- a/src/dwarfs/categorizer.cpp +++ b/src/dwarfs/categorizer.cpp @@ -49,6 +49,7 @@ class categorizer_manager_private : public categorizer_manager::impl { categorizers() const = 0; virtual fragment_category::value_type category(std::string_view cat) const = 0; + virtual bool has_multi_fragment_sequential_categorizers() const = 0; }; template @@ -66,6 +67,7 @@ class categorizer_job_ final : public categorizer_job::impl { void categorize_random_access(std::span data) override; void categorize_sequential(std::span data) override; inode_fragments result() override; + bool has_multi_fragment_sequential_categorizers() const override; private: LOG_PROXY_DECL(LoggerPolicy); @@ -158,6 +160,12 @@ inode_fragments categorizer_job_::result() { return best_; } +template +bool categorizer_job_< + LoggerPolicy>::has_multi_fragment_sequential_categorizers() const { + return mgr_.has_multi_fragment_sequential_categorizers(); +} + categorizer_job::categorizer_job() = default; categorizer_job::categorizer_job(std::unique_ptr impl) @@ -188,6 +196,10 @@ class categorizer_manager_ final : public categorizer_manager_private { return it->second; } + bool has_multi_fragment_sequential_categorizers() const override { + return has_multi_fragment_sequential_categorizers_; + } + private: void add_category(std::string_view cat) { if (catmap_.emplace(cat, categories_.size()).second) { @@ -202,6 +214,7 @@ class categorizer_manager_ final : public categorizer_manager_private { std::vector> categorizers_; std::vector categories_; std::unordered_map catmap_; + bool has_multi_fragment_sequential_categorizers_{false}; }; template @@ -211,6 +224,11 @@ void categorizer_manager_::add( add_category(c); } + if (!c->is_single_fragment() && + dynamic_cast(c.get())) { + has_multi_fragment_sequential_categorizers_ = true; + } + categorizers_.emplace_back(std::move(c)); } diff --git a/src/dwarfs/categorizer/binary_categorizer.cpp b/src/dwarfs/categorizer/binary_categorizer.cpp index 075cb85af..73e33b657 100644 --- a/src/dwarfs/categorizer/binary_categorizer.cpp +++ b/src/dwarfs/categorizer/binary_categorizer.cpp @@ -62,6 +62,8 @@ class binary_categorizer_ final : public binary_categorizer_base { categorize(std::filesystem::path const& path, std::span data, category_mapper const& mapper) const override; + bool is_single_fragment() const override { return false; } + private: LOG_PROXY_DECL(LoggerPolicy); }; diff --git a/src/dwarfs/categorizer/incompressible_categorizer.cpp b/src/dwarfs/categorizer/incompressible_categorizer.cpp index 976b2b054..4cc61f7a2 100644 --- a/src/dwarfs/categorizer/incompressible_categorizer.cpp +++ b/src/dwarfs/categorizer/incompressible_categorizer.cpp @@ -156,6 +156,7 @@ class incompressible_categorizer_ final : public sequential_categorizer { std::unique_ptr job(std::filesystem::path const& path, size_t total_size, category_mapper const& mapper) const override; + bool is_single_fragment() const override { return true; } private: logger& lgr_; diff --git a/src/dwarfs/categorizer/libmagic_categorizer.cpp b/src/dwarfs/categorizer/libmagic_categorizer.cpp index 47324906a..f22ccd1c5 100644 --- a/src/dwarfs/categorizer/libmagic_categorizer.cpp +++ b/src/dwarfs/categorizer/libmagic_categorizer.cpp @@ -147,6 +147,8 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base { categorize(std::filesystem::path const& path, std::span data, category_mapper const& mapper) const override; + bool is_single_fragment() const override { return true; } + private: LOG_PROXY_DECL(LoggerPolicy); magic_wrapper m_; diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 44eb7e2d9..1290365e8 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -155,6 +155,10 @@ class inode_ : public inode { if (mm) { if (catjob) { + DWARFS_CHECK( + !catjob.has_multi_fragment_sequential_categorizers(), + "multi-fragment sequential categorizers are not supported"); + catjob.set_total_size(mm->size()); catjob.categorize_random_access(mm->span()); } From c6206653d2e457aadf535674fb21abd38b8d44c5 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 14:31:42 +0200 Subject: [PATCH 006/571] Add support for category metadata --- include/dwarfs/categorizer.h | 8 ++++++++ src/dwarfs/categorizer.cpp | 20 ++++++++++++++----- src/dwarfs/categorizer/binary_categorizer.cpp | 4 ++++ .../incompressible_categorizer.cpp | 5 +++++ .../categorizer/libmagic_categorizer.cpp | 4 ++++ 5 files changed, 36 insertions(+), 5 deletions(-) diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h index d7d3a0cc5..7aa0d0655 100644 --- a/include/dwarfs/categorizer.h +++ b/include/dwarfs/categorizer.h @@ -30,6 +30,8 @@ #include #include +#include + #include "dwarfs/inode_fragments.h" namespace boost::program_options { @@ -50,6 +52,7 @@ class categorizer { virtual std::span categories() const = 0; virtual bool is_single_fragment() const = 0; + virtual folly::dynamic category_metadata(fragment_category c) const = 0; }; class random_access_categorizer : public categorizer { @@ -132,6 +135,10 @@ class categorizer_manager { return impl_->category_name(c); } + folly::dynamic category_metadata(fragment_category c) const { + return impl_->category_metadata(c); + } + class impl { public: virtual ~impl() = default; @@ -140,6 +147,7 @@ class categorizer_manager { virtual categorizer_job job(std::filesystem::path const& path) const = 0; virtual std::string_view category_name(fragment_category::value_type c) const = 0; + virtual folly::dynamic category_metadata(fragment_category c) const = 0; }; private: diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp index 00dcc50c9..d50a9c9e0 100644 --- a/src/dwarfs/categorizer.cpp +++ b/src/dwarfs/categorizer.cpp @@ -185,6 +185,8 @@ class categorizer_manager_ final : public categorizer_manager_private { std::string_view category_name(fragment_category::value_type c) const override; + folly::dynamic category_metadata(fragment_category c) const override; + std::vector> const& categorizers() const override { return categorizers_; @@ -201,9 +203,9 @@ class categorizer_manager_ final : public categorizer_manager_private { } private: - void add_category(std::string_view cat) { + void add_category(std::string_view cat, size_t categorizer_index) { if (catmap_.emplace(cat, categories_.size()).second) { - categories_.emplace_back(cat); + categories_.emplace_back(cat, categorizer_index); } else { LOG_WARN << "duplicate category: " << cat; } @@ -212,7 +214,7 @@ class categorizer_manager_ final : public categorizer_manager_private { logger& lgr_; LOG_PROXY_DECL(LoggerPolicy); std::vector> categorizers_; - std::vector categories_; + std::vector> categories_; std::unordered_map catmap_; bool has_multi_fragment_sequential_categorizers_{false}; }; @@ -221,7 +223,7 @@ template void categorizer_manager_::add( std::shared_ptr c) { for (auto const& c : c->categories()) { - add_category(c); + add_category(c, categorizers_.size()); } if (!c->is_single_fragment() && @@ -243,7 +245,15 @@ categorizer_job categorizer_manager_::job( template std::string_view categorizer_manager_::category_name( fragment_category::value_type c) const { - return DWARFS_NOTHROW(categories_.at(c)); + return DWARFS_NOTHROW(categories_.at(c)).first; +} + +template +folly::dynamic categorizer_manager_::category_metadata( + fragment_category c) const { + auto categorizer = + DWARFS_NOTHROW(categorizers_.at(categories_.at(c.value()).second)); + return categorizer->category_metadata(c); } categorizer_manager::categorizer_manager(logger& lgr) diff --git a/src/dwarfs/categorizer/binary_categorizer.cpp b/src/dwarfs/categorizer/binary_categorizer.cpp index 73e33b657..e14c03a90 100644 --- a/src/dwarfs/categorizer/binary_categorizer.cpp +++ b/src/dwarfs/categorizer/binary_categorizer.cpp @@ -64,6 +64,10 @@ class binary_categorizer_ final : public binary_categorizer_base { bool is_single_fragment() const override { return false; } + folly::dynamic category_metadata(fragment_category) const override { + return folly::dynamic(); + } + private: LOG_PROXY_DECL(LoggerPolicy); }; diff --git a/src/dwarfs/categorizer/incompressible_categorizer.cpp b/src/dwarfs/categorizer/incompressible_categorizer.cpp index 4cc61f7a2..7af8fc0b8 100644 --- a/src/dwarfs/categorizer/incompressible_categorizer.cpp +++ b/src/dwarfs/categorizer/incompressible_categorizer.cpp @@ -156,8 +156,13 @@ class incompressible_categorizer_ final : public sequential_categorizer { std::unique_ptr job(std::filesystem::path const& path, size_t total_size, category_mapper const& mapper) const override; + bool is_single_fragment() const override { return true; } + folly::dynamic category_metadata(fragment_category) const override { + return folly::dynamic(); + } + private: logger& lgr_; incompressible_categorizer_config const config_; diff --git a/src/dwarfs/categorizer/libmagic_categorizer.cpp b/src/dwarfs/categorizer/libmagic_categorizer.cpp index f22ccd1c5..05dba28d5 100644 --- a/src/dwarfs/categorizer/libmagic_categorizer.cpp +++ b/src/dwarfs/categorizer/libmagic_categorizer.cpp @@ -149,6 +149,10 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base { bool is_single_fragment() const override { return true; } + folly::dynamic category_metadata(fragment_category) const override { + return folly::dynamic(); + } + private: LOG_PROXY_DECL(LoggerPolicy); magic_wrapper m_; From 74159770d892a4c07aef5882b775a95411a57d27 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 14:31:56 +0200 Subject: [PATCH 007/571] Remove default category --- src/dwarfs/categorizer.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp index d50a9c9e0..33e3b194c 100644 --- a/src/dwarfs/categorizer.cpp +++ b/src/dwarfs/categorizer.cpp @@ -39,10 +39,6 @@ using namespace std::placeholders; namespace po = boost::program_options; -namespace { -constexpr std::string_view const DEFAULT_CATEGORY{""}; -} - class categorizer_manager_private : public categorizer_manager::impl { public: virtual std::vector> const& @@ -176,9 +172,7 @@ class categorizer_manager_ final : public categorizer_manager_private { public: categorizer_manager_(logger& lgr) : lgr_{lgr} - , LOG_PROXY_INIT(lgr) { - add_category(DEFAULT_CATEGORY); - } + , LOG_PROXY_INIT(lgr) {} void add(std::shared_ptr c) override; categorizer_job job(std::filesystem::path const& path) const override; From 0fe8be7ad502075246bdf7e5cb64664904414553 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 14:32:10 +0200 Subject: [PATCH 008/571] Add pcmaudio categorizer --- CMakeLists.txt | 1 + .../categorizer/pcmaudio_categorizer.cpp | 113 ++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 src/dwarfs/categorizer/pcmaudio_categorizer.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e4567d76..03a058c01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -421,6 +421,7 @@ list( LIBDWARFS_CATEGORIZER_SRC src/dwarfs/categorizer/binary_categorizer.cpp src/dwarfs/categorizer/incompressible_categorizer.cpp + src/dwarfs/categorizer/pcmaudio_categorizer.cpp ) if(LIBMAGIC_FOUND) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp new file mode 100644 index 000000000..d0bf0c2d5 --- /dev/null +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -0,0 +1,113 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/error.h" +#include "dwarfs/logger.h" + +namespace dwarfs { + +namespace po = boost::program_options; + +namespace { + +constexpr std::string_view const AIFF_CATEGORY{"aiff"}; +constexpr std::string_view const WAV_CATEGORY{"wav"}; + +class pcmaudio_categorizer_base : public random_access_categorizer { + public: + std::span categories() const override; +}; + +template +class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base { + public: + pcmaudio_categorizer_(logger& lgr) + : LOG_PROXY_INIT(lgr) {} + + inode_fragments + categorize(std::filesystem::path const& path, std::span data, + category_mapper const& mapper) const override; + + bool is_single_fragment() const override { return false; } + + folly::dynamic category_metadata(fragment_category) const override { + // TODO + return folly::dynamic(); + } + + private: + LOG_PROXY_DECL(LoggerPolicy); +}; + +std::span +pcmaudio_categorizer_base::categories() const { + static constexpr std::array const s_categories{ + AIFF_CATEGORY, + WAV_CATEGORY, + }; + return s_categories; +} + +template +inode_fragments pcmaudio_categorizer_::categorize( + std::filesystem::path const&, + std::span data [[maybe_unused]], + category_mapper const& /*mapper*/) const { + inode_fragments fragments; + + return fragments; +} + +class pcmaudio_categorizer_factory : public categorizer_factory { + public: + std::string_view name() const override { return "pcmaudio"; } + + std::shared_ptr + options() const override { + return nullptr; + } + + std::unique_ptr + create(logger& lgr, po::variables_map const& /*vm*/) const override { + return make_unique_logging_object(lgr); + } + + private: +}; + +} // namespace + +REGISTER_CATEGORIZER_FACTORY(pcmaudio_categorizer_factory) + +} // namespace dwarfs From f30910e79b7232258d78cd567f2622c06c869f84 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 14:53:42 +0200 Subject: [PATCH 009/571] Allow categorizer selection on command line --- src/mkdwarfs_main.cpp | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/src/mkdwarfs_main.cpp b/src/mkdwarfs_main.cpp index e873e73bf..8fab50302 100644 --- a/src/mkdwarfs_main.cpp +++ b/src/mkdwarfs_main.cpp @@ -315,11 +315,12 @@ int mkdwarfs_main(int argc, sys_char** argv) { std::string memory_limit, script_arg, compression, header, schema_compression, metadata_compression, log_level_str, timestamp, time_resolution, order, progress_mode, recompress_opts, pack_metadata, file_hash_algo, - debug_filter, max_similarity_size, input_list_str, chmod_str; + debug_filter, max_similarity_size, input_list_str, chmod_str, + categorizer_list_str; std::vector filter; size_t num_workers, num_scanner_workers; bool no_progress = false, remove_header = false, no_section_index = false, - force_overwrite = false, enable_categorizer = false; + force_overwrite = false; unsigned level; int compress_niceness; uint16_t uid, gid; @@ -345,6 +346,12 @@ int mkdwarfs_main(int argc, sys_char** argv) { auto file_hash_desc = "choice of file hashing function (none, " + (from(hash_list) | unsplit(", ")) + ")"; + auto& catreg = categorizer_registry::instance(); + + auto categorize_desc = "enable categorizers in the given order (" + + (from(catreg.categorizer_names()) | unsplit(", ")) + + ")"; + // clang-format off po::options_description basic_opts("Options"); basic_opts.add_options() @@ -393,8 +400,9 @@ int mkdwarfs_main(int argc, sys_char** argv) { po::value(&recompress_opts)->implicit_value("all"), "recompress an existing filesystem (none, block, metadata, all)") ("categorize", - po::value(&enable_categorizer)->zero_tokens(), - "WIP enable categorizer") + po::value(&categorizer_list_str) + ->default_value("pcmaudio,incompressible"), + categorize_desc.c_str()) ("order", po::value(&order), order_desc.c_str()) @@ -514,7 +522,6 @@ int mkdwarfs_main(int argc, sys_char** argv) { .add(filesystem_opts) .add(metadata_opts); - auto& catreg = categorizer_registry::instance(); catreg.add_options(opts); po::variables_map vm; @@ -1028,11 +1035,16 @@ int mkdwarfs_main(int argc, sys_char** argv) { options.file_order.mode == file_order_mode::SIMILARITY; options.inode.with_nilsimsa = options.file_order.mode == file_order_mode::NILSIMSA; - if (enable_categorizer) { + + if (!categorizer_list_str.empty()) { + std::vector categorizer_list; + boost::split(categorizer_list, categorizer_list_str, + boost::is_any_of(",")); + options.inode.categorizer_mgr = std::make_shared(lgr); - // TODO - for (auto const& name : catreg.categorizer_names()) { + + for (auto const& name : categorizer_list) { options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm)); } } From 465f8e090cfbae54eb46adc4693a36b87e6412bf Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 15:33:21 +0200 Subject: [PATCH 010/571] Implement metadata handling in pcmaudio categorizer --- .../categorizer/pcmaudio_categorizer.cpp | 84 +++++++++++++++++-- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index d0bf0c2d5..bce7b63a6 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -30,6 +30,8 @@ #include +#include + #include "dwarfs/categorizer.h" #include "dwarfs/error.h" #include "dwarfs/logger.h" @@ -40,8 +42,76 @@ namespace po = boost::program_options; namespace { -constexpr std::string_view const AIFF_CATEGORY{"aiff"}; -constexpr std::string_view const WAV_CATEGORY{"wav"}; +constexpr std::string_view const PCMAUDIO_CATEGORY{"pcmaudio"}; + +enum class endianness : uint8_t { + BIG, + LITTLE, +}; +enum class signedness : uint8_t { + SIGNED, + UNSIGNED, +}; + +char const* endianness_string(endianness e) { + switch (e) { + case endianness::BIG: + return "big"; + case endianness::LITTLE: + return "little"; + } +} + +char const* signedness_string(signedness s) { + switch (s) { + case signedness::SIGNED: + return "signed"; + case signedness::UNSIGNED: + return "unsigned"; + } +} + +struct pcmaudio_metadata { + endianness sample_endianness; + signedness sample_signedness; + uint8_t bits_per_sample; + uint16_t number_of_channels; + + //// Sample rate should be irrelevant + // uint32_t samples_per_second; + + auto operator<=>(pcmaudio_metadata const&) const = default; +}; + +class pcmaudio_metadata_store { + public: + pcmaudio_metadata_store() = default; + + size_t add(pcmaudio_metadata const& m) { + auto it = reverse_index_.find(m); + if (it == reverse_index_.end()) { + auto r = reverse_index_.emplace(m, forward_index_.size()); + assert(r.second); + forward_index_.emplace_back(m); + it = r.first; + } + return it->second; + } + + folly::dynamic lookup(size_t ix) const { + auto const& m = DWARFS_NOTHROW(forward_index_.at(ix)); + folly::dynamic obj = folly::dynamic::object; + obj.insert("endianness", endianness_string(m.sample_endianness)); + obj.insert("signedness", signedness_string(m.sample_signedness)); + obj.insert("bits_per_sample", m.bits_per_sample); + obj.insert("number_of_channels", m.number_of_channels); + return obj; + } + + private: + std::vector forward_index_; + std::map reverse_index_; +}; class pcmaudio_categorizer_base : public random_access_categorizer { public: @@ -60,20 +130,20 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base { bool is_single_fragment() const override { return false; } - folly::dynamic category_metadata(fragment_category) const override { - // TODO - return folly::dynamic(); + folly::dynamic category_metadata(fragment_category c) const override { + DWARFS_CHECK(c.has_subcategory(), "expected pcmaudio to have subcategory"); + return meta_.rlock()->lookup(c.subcategory()); } private: LOG_PROXY_DECL(LoggerPolicy); + folly::Synchronized mutable meta_; }; std::span pcmaudio_categorizer_base::categories() const { static constexpr std::array const s_categories{ - AIFF_CATEGORY, - WAV_CATEGORY, + PCMAUDIO_CATEGORY, }; return s_categories; } From e08bbb5e9465f0ce6506b895a211567bdab16f0d Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 18:40:04 +0200 Subject: [PATCH 011/571] Pass category name to category_metadata() --- include/dwarfs/categorizer.h | 3 ++- include/dwarfs/inode_fragments.h | 2 ++ src/dwarfs/categorizer.cpp | 6 +++--- src/dwarfs/categorizer/binary_categorizer.cpp | 3 ++- src/dwarfs/categorizer/incompressible_categorizer.cpp | 3 ++- src/dwarfs/categorizer/libmagic_categorizer.cpp | 3 ++- src/dwarfs/categorizer/pcmaudio_categorizer.cpp | 11 ++++++++--- 7 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h index 7aa0d0655..0d0847280 100644 --- a/include/dwarfs/categorizer.h +++ b/include/dwarfs/categorizer.h @@ -52,7 +52,8 @@ class categorizer { virtual std::span categories() const = 0; virtual bool is_single_fragment() const = 0; - virtual folly::dynamic category_metadata(fragment_category c) const = 0; + virtual folly::dynamic category_metadata(std::string_view category_name, + fragment_category c) const = 0; }; class random_access_categorizer : public categorizer { diff --git a/include/dwarfs/inode_fragments.h b/include/dwarfs/inode_fragments.h index 319132271..2c54d25a8 100644 --- a/include/dwarfs/inode_fragments.h +++ b/include/dwarfs/inode_fragments.h @@ -64,6 +64,8 @@ class inode_fragments { bool empty() const { return fragments_.empty(); } + void clear() { fragments_.clear(); } + explicit operator bool() const { return !empty(); } std::ostream& diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp index 33e3b194c..781939415 100644 --- a/src/dwarfs/categorizer.cpp +++ b/src/dwarfs/categorizer.cpp @@ -245,9 +245,9 @@ std::string_view categorizer_manager_::category_name( template folly::dynamic categorizer_manager_::category_metadata( fragment_category c) const { - auto categorizer = - DWARFS_NOTHROW(categorizers_.at(categories_.at(c.value()).second)); - return categorizer->category_metadata(c); + auto cat = DWARFS_NOTHROW(categories_.at(c.value())); + auto categorizer = DWARFS_NOTHROW(categorizers_.at(cat.second)); + return categorizer->category_metadata(cat.first, c); } categorizer_manager::categorizer_manager(logger& lgr) diff --git a/src/dwarfs/categorizer/binary_categorizer.cpp b/src/dwarfs/categorizer/binary_categorizer.cpp index e14c03a90..3bab54913 100644 --- a/src/dwarfs/categorizer/binary_categorizer.cpp +++ b/src/dwarfs/categorizer/binary_categorizer.cpp @@ -64,7 +64,8 @@ class binary_categorizer_ final : public binary_categorizer_base { bool is_single_fragment() const override { return false; } - folly::dynamic category_metadata(fragment_category) const override { + folly::dynamic + category_metadata(std::string_view, fragment_category) const override { return folly::dynamic(); } diff --git a/src/dwarfs/categorizer/incompressible_categorizer.cpp b/src/dwarfs/categorizer/incompressible_categorizer.cpp index 7af8fc0b8..f9543401f 100644 --- a/src/dwarfs/categorizer/incompressible_categorizer.cpp +++ b/src/dwarfs/categorizer/incompressible_categorizer.cpp @@ -159,7 +159,8 @@ class incompressible_categorizer_ final : public sequential_categorizer { bool is_single_fragment() const override { return true; } - folly::dynamic category_metadata(fragment_category) const override { + folly::dynamic + category_metadata(std::string_view, fragment_category) const override { return folly::dynamic(); } diff --git a/src/dwarfs/categorizer/libmagic_categorizer.cpp b/src/dwarfs/categorizer/libmagic_categorizer.cpp index 05dba28d5..d072fda62 100644 --- a/src/dwarfs/categorizer/libmagic_categorizer.cpp +++ b/src/dwarfs/categorizer/libmagic_categorizer.cpp @@ -149,7 +149,8 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base { bool is_single_fragment() const override { return true; } - folly::dynamic category_metadata(fragment_category) const override { + folly::dynamic + category_metadata(std::string_view, fragment_category) const override { return folly::dynamic(); } diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index bce7b63a6..f07cf614b 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -130,9 +130,14 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base { bool is_single_fragment() const override { return false; } - folly::dynamic category_metadata(fragment_category c) const override { - DWARFS_CHECK(c.has_subcategory(), "expected pcmaudio to have subcategory"); - return meta_.rlock()->lookup(c.subcategory()); + folly::dynamic category_metadata(std::string_view category_name, + fragment_category c) const override { + if (category_name == PCMAUDIO_CATEGORY) { + DWARFS_CHECK(c.has_subcategory(), + "expected PCMAUDIO to have subcategory"); + return meta_.rlock()->lookup(c.subcategory()); + } + return folly::dynamic(); } private: From 4bcbb3bfe9866bf412cac7bb2299ad536f26196a Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 18:40:15 +0200 Subject: [PATCH 012/571] CAF format support --- .../categorizer/pcmaudio_categorizer.cpp | 404 +++++++++++++++++- 1 file changed, 400 insertions(+), 4 deletions(-) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index f07cf614b..a17dd7081 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -31,6 +31,7 @@ #include #include +#include #include "dwarfs/categorizer.h" #include "dwarfs/error.h" @@ -38,21 +39,31 @@ namespace dwarfs { +namespace fs = std::filesystem; namespace po = boost::program_options; namespace { +constexpr std::string_view const METADATA_CATEGORY{"metadata"}; constexpr std::string_view const PCMAUDIO_CATEGORY{"pcmaudio"}; +constexpr size_t const MIN_PCMAUDIO_SIZE{512}; + enum class endianness : uint8_t { BIG, LITTLE, }; + enum class signedness : uint8_t { SIGNED, UNSIGNED, }; +enum class padding : uint8_t { + LSB, + MSB, +}; + char const* endianness_string(endianness e) { switch (e) { case endianness::BIG: @@ -71,10 +82,21 @@ char const* signedness_string(signedness s) { } } +char const* padding_string(padding p) { + switch (p) { + case padding::LSB: + return "lsb"; + case padding::MSB: + return "msb"; + } +} + struct pcmaudio_metadata { endianness sample_endianness; signedness sample_signedness; + padding sample_padding; uint8_t bits_per_sample; + uint8_t bytes_per_sample; uint16_t number_of_channels; //// Sample rate should be irrelevant @@ -83,6 +105,16 @@ struct pcmaudio_metadata { auto operator<=>(pcmaudio_metadata const&) const = default; }; +std::ostream& operator<<(std::ostream& os, pcmaudio_metadata const& m) { + os << "[" << endianness_string(m.sample_endianness) << ", " + << signedness_string(m.sample_signedness) << ", " + << padding_string(m.sample_padding) << ", " + << "bits=" << static_cast(m.bits_per_sample) << ", " + << "bytes=" << static_cast(m.bytes_per_sample) << ", " + << "channels=" << static_cast(m.number_of_channels) << "]"; + return os; +} + class pcmaudio_metadata_store { public: pcmaudio_metadata_store() = default; @@ -103,6 +135,8 @@ class pcmaudio_metadata_store { folly::dynamic obj = folly::dynamic::object; obj.insert("endianness", endianness_string(m.sample_endianness)); obj.insert("signedness", signedness_string(m.sample_signedness)); + obj.insert("padding", padding_string(m.sample_padding)); + obj.insert("bytes_per_sample", m.bytes_per_sample); obj.insert("bits_per_sample", m.bits_per_sample); obj.insert("number_of_channels", m.number_of_channels); return obj; @@ -125,7 +159,7 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base { : LOG_PROXY_INIT(lgr) {} inode_fragments - categorize(std::filesystem::path const& path, std::span data, + categorize(fs::path const& path, std::span data, category_mapper const& mapper) const override; bool is_single_fragment() const override { return false; } @@ -141,6 +175,19 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base { } private: + bool check_aiff(inode_fragments& frag, fs::path const& path, + std::span data, + category_mapper const& mapper) const; + bool + check_caf(inode_fragments& frag, fs::path const& path, + std::span data, category_mapper const& mapper) const; + bool + check_wav(inode_fragments& frag, fs::path const& path, + std::span data, category_mapper const& mapper) const; + bool check_wav64(inode_fragments& frag, fs::path const& path, + std::span data, + category_mapper const& mapper) const; + LOG_PROXY_DECL(LoggerPolicy); folly::Synchronized mutable meta_; }; @@ -148,18 +195,367 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base { std::span pcmaudio_categorizer_base::categories() const { static constexpr std::array const s_categories{ + METADATA_CATEGORY, PCMAUDIO_CATEGORY, }; return s_categories; } +template +bool pcmaudio_categorizer_::check_aiff( + inode_fragments& frag, fs::path const& path, std::span data, + category_mapper const& mapper) const { + if (std::memcmp(data.data(), "FORM", 4) != 0 || + std::memcmp(data.data() + 8, "AIFF", 4) != 0) { + return false; + } + + FOLLY_PACK_PUSH + + struct chk_hdr_t { + char id[4]; + uint32_t size; + } FOLLY_PACK_ATTR; + + struct comm_chk_t { + uint16_t num_chan; + uint32_t num_sample_frames; + uint16_t sample_size; + // long double sample_rate; // we can't pack this :/ + } FOLLY_PACK_ATTR; + + struct ssnd_chk_t { + uint32_t offset; + uint32_t block_size; + } FOLLY_PACK_ATTR; + + FOLLY_PACK_POP + + static_assert(sizeof(chk_hdr_t) == 8); + static_assert(sizeof(comm_chk_t) == 8); + static_assert(sizeof(ssnd_chk_t) == 8); + + bool meta_valid{false}; + uint32_t num_sample_frames; + pcmaudio_metadata meta; + size_t pos = 12; + chk_hdr_t chk_hdr; + + while (pos + sizeof(chk_hdr) <= data.size()) { + std::memcpy(&chk_hdr, data.data() + pos, sizeof(chk_hdr)); + uint32_t chk_size = folly::Endian::big(chk_hdr.size); + + LOG_TRACE << "[AIFF] " << path << ": " << std::string_view(chk_hdr.id, 4) + << " (len=" << chk_size << ")"; + + if (pos + sizeof(chk_hdr) + chk_size > data.size()) { + LOG_WARN << "[AIFF] " << path << ": unexpected end of file"; + // corrupt AIFF? -> skip + return false; + } + + if (std::memcmp(chk_hdr.id, "COMM", 4) == 0) { + if (chk_size != 18) { + LOG_WARN << "[AIFF] " << path + << ": unexpected size for COMM chunk: " << chk_size + << " (expected 18)"; + // corrupt AIFF? -> skip + return false; + } + + if (meta_valid) { + LOG_WARN << "[AIFF] " << path << ": unexpected second COMM chunk"; + // corrupt AIFF? -> skip + return false; + } + + comm_chk_t comm; + std::memcpy(&comm, data.data() + pos + sizeof(chk_hdr), sizeof(comm)); + + meta.sample_endianness = endianness::BIG; + meta.sample_signedness = signedness::SIGNED; + meta.sample_padding = padding::LSB; + meta.bits_per_sample = folly::Endian::big(comm.sample_size); + meta.bytes_per_sample = (meta.bits_per_sample + 7) / 8; + meta.number_of_channels = folly::Endian::big(comm.num_chan); + num_sample_frames = folly::Endian::big(comm.num_sample_frames); + + meta_valid = true; + + LOG_TRACE << "[AIFF] " << path << ": meta=" << meta; + } else if (std::memcmp(chk_hdr.id, "SSND", 4) == 0) { + if (!meta_valid) { + LOG_WARN << "[AIFF] " << path << ": got SSND chunk without COMM chunk"; + // corrupt AIFF? -> skip + return false; + } + + ssnd_chk_t ssnd; + std::memcpy(&ssnd, data.data() + pos + sizeof(chk_hdr), sizeof(ssnd)); + ssnd.offset = folly::Endian::big(ssnd.offset); + ssnd.block_size = folly::Endian::big(ssnd.block_size); + + size_t pcm_start = pos + sizeof(chk_hdr) + sizeof(ssnd) + ssnd.offset; + size_t pcm_length = + num_sample_frames * (meta.number_of_channels * meta.bytes_per_sample); + + if (sizeof(ssnd) + ssnd.offset + pcm_length > chk_size) { + LOG_WARN << "[AIFF] " << path + << ": SSND invalid chunk size (offset=" << ssnd.offset + << ", pcm_len=" << pcm_length << ", chk_size" << chk_size + << ")"; + // corrupt AIFF? -> skip + return false; + } + + fragment_category::value_type subcategory = meta_.wlock()->add(meta); + + frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)), + pcm_start); + frag.emplace_back( + fragment_category(mapper(PCMAUDIO_CATEGORY), subcategory), + pcm_length); + + if (pcm_start + pcm_length < data.size()) { + frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)), + data.size() - (pcm_start + pcm_length)); + } + + return true; + } + + pos += sizeof(chk_hdr) + chk_size; + } + + return false; +} + +template +bool pcmaudio_categorizer_::check_caf( + inode_fragments& frag, fs::path const& path, std::span data, + category_mapper const& mapper) const { + if (std::memcmp(data.data(), "caff", 4) != 0) { + return false; + } + + FOLLY_PACK_PUSH + + struct caff_hdr_t { + uint16_t version; + uint16_t flags; + } FOLLY_PACK_ATTR; + + struct chk_hdr_t { + char id[4]; + uint64_t size; + } FOLLY_PACK_ATTR; + + struct format_chk_t { + double sample_rate; + char format_id[4]; + uint32_t format_flags; + uint32_t bytes_per_packet; + uint32_t frames_per_packet; + uint32_t channels_per_frame; + uint32_t bits_per_channel; + } FOLLY_PACK_ATTR; + + struct data_chk_t { + uint32_t edit_count; + } FOLLY_PACK_ATTR; + + FOLLY_PACK_POP + + static_assert(sizeof(caff_hdr_t) == 4); + static_assert(sizeof(chk_hdr_t) == 12); + static_assert(sizeof(format_chk_t) == 32); + static_assert(sizeof(data_chk_t) == 4); + + static constexpr uint32_t const kCAFLinearPCMFormatFlagIsFloat{1L << 0}; + static constexpr uint32_t const kCAFLinearPCMFormatFlagIsLittleEndian{1L + << 1}; + + caff_hdr_t caff_hdr; + std::memcpy(&caff_hdr, data.data() + 4, sizeof(caff_hdr)); + caff_hdr.version = folly::Endian::big(caff_hdr.version); + caff_hdr.flags = folly::Endian::big(caff_hdr.flags); + + if (caff_hdr.version != 1 || caff_hdr.flags != 0) { + LOG_WARN << "[CAF] " << path + << ": unsupported file version/flags: " << caff_hdr.version << "/" + << caff_hdr.flags; + return false; + } + + bool meta_valid{false}; + pcmaudio_metadata meta; + size_t pos = 8; + chk_hdr_t chk_hdr; + + while (pos + sizeof(chk_hdr) <= data.size()) { + std::memcpy(&chk_hdr, data.data() + pos, sizeof(chk_hdr)); + uint64_t chk_size = folly::Endian::big(chk_hdr.size); + + LOG_TRACE << "[CAF] " << path << ": " << std::string_view(chk_hdr.id, 4) + << " (len=" << chk_size << ")"; + + if (chk_size == std::numeric_limits::max() && + std::memcmp(chk_hdr.id, "data", 4) == 0) { + chk_size = data.size() - (pos + sizeof(chk_hdr)); + } + + if (pos + sizeof(chk_hdr) + chk_size > data.size()) { + LOG_WARN << "[CAF] " << path << ": unexpected end of file"; + // corrupt CAF? -> skip + return false; + } + + if (std::memcmp(chk_hdr.id, "desc", 4) == 0) { + if (chk_size != sizeof(format_chk_t)) { + LOG_WARN << "[CAF] " << path + << ": unexpected size for desc chunk: " << chk_size + << " (expected " << sizeof(format_chk_t) << ")"; + // corrupt CAF? -> skip + return false; + } + + if (meta_valid) { + LOG_WARN << "[CAF] " << path << ": unexpected second desc chunk"; + // corrupt CAF? -> skip + return false; + } + + format_chk_t fmt; + std::memcpy(&fmt, data.data() + pos + sizeof(chk_hdr), sizeof(fmt)); + + if (std::memcmp(fmt.format_id, "lpcm", 4) != 0) { + // TODO: alaw, ulaw? + LOG_DEBUG << "[CAF] " << path << ": found compressed format"; + return false; + } + + fmt.format_flags = folly::Endian::big(fmt.format_flags); + + if (fmt.format_flags & kCAFLinearPCMFormatFlagIsFloat) { + LOG_DEBUG << "[CAF] " << path << ": floating point is unsupported"; + return false; + } + + fmt.frames_per_packet = folly::Endian::big(fmt.frames_per_packet); + + if (fmt.frames_per_packet != 1) { + LOG_WARN << "[CAF] " << path + << ": unsupported frames/packet: " << fmt.frames_per_packet; + return false; + } + + fmt.bytes_per_packet = folly::Endian::big(fmt.bytes_per_packet); + + meta.sample_endianness = + (fmt.format_flags & kCAFLinearPCMFormatFlagIsLittleEndian) + ? endianness::LITTLE + : endianness::BIG; + meta.sample_signedness = signedness::SIGNED; + meta.sample_padding = padding::LSB; + meta.bits_per_sample = folly::Endian::big(fmt.bits_per_channel); + meta.number_of_channels = folly::Endian::big(fmt.channels_per_frame); + meta.bytes_per_sample = fmt.bytes_per_packet / meta.number_of_channels; + + if (fmt.bytes_per_packet % meta.number_of_channels != 0) { + LOG_WARN << "[CAF] " << path + << ": unsupported packet size: " << fmt.bytes_per_packet + << " (" << meta.number_of_channels << " channels)"; + return false; + } + + meta_valid = true; + + LOG_TRACE << "[CAF] " << path << ": meta=" << meta; + } else if (std::memcmp(chk_hdr.id, "data", 4) == 0) { + if (!meta_valid) { + LOG_WARN << "[CAF] " << path << ": got data chunk without desc chunk"; + // corrupt CAF? -> skip + return false; + } + + size_t pcm_start = pos + sizeof(chk_hdr) + sizeof(data_chk_t); + size_t pcm_length = chk_size - sizeof(data_chk_t); + + if (pcm_length % (meta.number_of_channels * meta.bytes_per_sample)) { + LOG_WARN << "[CAF] " << path + << ": data chunk size mismatch (pcm_len=" << pcm_length + << ", #chan=" << meta.number_of_channels + << ", bytes_per_sample=" << meta.bytes_per_sample << ")"; + // corrupt CAF? -> skip + return false; + } + + fragment_category::value_type subcategory = meta_.wlock()->add(meta); + + frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)), + pcm_start); + frag.emplace_back( + fragment_category(mapper(PCMAUDIO_CATEGORY), subcategory), + pcm_length); + + if (pcm_start + pcm_length < data.size()) { + frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)), + data.size() - (pcm_start + pcm_length)); + } + + return true; + } + + pos += sizeof(chk_hdr) + chk_size; + } + + return false; +} + +template +bool pcmaudio_categorizer_::check_wav( + inode_fragments& frag, fs::path const& path, std::span data, + category_mapper const& mapper) const { + if (std::memcmp(data.data(), "RIFF", 4) != 0) { + return false; + } + return false; +} + +template +bool pcmaudio_categorizer_::check_wav64( + inode_fragments& frag, fs::path const& path, std::span data, + category_mapper const& mapper) const { + if (std::memcmp(data.data(), "riff", 4) != 0) { + return false; + } + return false; +} + template inode_fragments pcmaudio_categorizer_::categorize( - std::filesystem::path const&, - std::span data [[maybe_unused]], - category_mapper const& /*mapper*/) const { + fs::path const& path, std::span data, + category_mapper const& mapper) const { inode_fragments fragments; + if (data.size() >= MIN_PCMAUDIO_SIZE) { + for (auto f : { + // clang-format off + &pcmaudio_categorizer_::check_aiff, + &pcmaudio_categorizer_::check_caf, + &pcmaudio_categorizer_::check_wav, + &pcmaudio_categorizer_::check_wav64, + // clang-format on + }) { + if ((this->*f)(fragments, path, data, mapper)) { + break; + } + + // clean up + fragments.clear(); + } + } + return fragments; } From 0d25c6e70407a7e37cc4e12ce39bf5e3a2e78c96 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 20:14:33 +0200 Subject: [PATCH 013/571] Basic categorizer fuzzer --- CMakeLists.txt | 7 +++++ test/fuzz_categorizers.cpp | 58 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 test/fuzz_categorizers.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 03a058c01..2db2ec428 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ include(CheckCXXSourceCompiles) option(WITH_TESTS "build with tests" OFF) option(WITH_BENCHMARKS "build with benchmarks" OFF) +option(WITH_FUZZ "build with fuzzing binaries" OFF) option(ENABLE_PERFMON "enable performance monitor in all tools" ON) if(WIN32) set(PREFER_SYSTEM_LIBFMT ON) @@ -586,6 +587,12 @@ if(WITH_BENCHMARKS) list(APPEND BINARY_TARGETS dwarfs_benchmark) endif() +if(WITH_FUZZ) + add_executable(fuzz_categorizers test/fuzz_categorizers.cpp) + target_link_libraries(fuzz_categorizers "$") + list(APPEND BINARY_TARGETS fuzz_categorizers) +endif() + if(WITH_MAN_PAGES) list(REMOVE_DUPLICATES MAN_DIRS) add_custom_target(manpages ALL DEPENDS ${MAN_PAGES}) diff --git a/test/fuzz_categorizers.cpp b/test/fuzz_categorizers.cpp new file mode 100644 index 000000000..973493266 --- /dev/null +++ b/test/fuzz_categorizers.cpp @@ -0,0 +1,58 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#include + +#include + +#include "dwarfs/categorizer.h" +#include "dwarfs/logger.h" +#include "dwarfs/mmap.h" + +using namespace dwarfs; + +int main(int argc, char** argv) { + if (argc < 2) { + return 1; + } + + stream_logger lgr(std::cerr); + + auto& catreg = categorizer_registry::instance(); + auto catmgr = std::make_shared(lgr); + + boost::program_options::variables_map vm; + catmgr->add(catreg.create(lgr, "pcmaudio", vm)); + +#ifdef __AFL_LOOP + while (__AFL_LOOP(10000)) +#endif + { + std::filesystem::path p(argv[1]); + auto mm = mmap(p); + auto job = catmgr->job(p); + job.set_total_size(mm.size()); + job.categorize_random_access(mm.span()); + auto res [[maybe_unused]] = job.result(); + } + + return 0; +} From c17ab2b44afea4f3acde346b47b6bef4c5fe0548 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 20:14:55 +0200 Subject: [PATCH 014/571] Fix a few bugs found by fuzzing --- .../categorizer/pcmaudio_categorizer.cpp | 41 ++++++++++++++++++- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index a17dd7081..989a6f6e1 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -20,6 +20,7 @@ */ #include +#include #include #include #include @@ -47,7 +48,7 @@ namespace { constexpr std::string_view const METADATA_CATEGORY{"metadata"}; constexpr std::string_view const PCMAUDIO_CATEGORY{"pcmaudio"}; -constexpr size_t const MIN_PCMAUDIO_SIZE{512}; +constexpr size_t const MIN_PCMAUDIO_SIZE{64}; enum class endianness : uint8_t { BIG, @@ -280,6 +281,17 @@ bool pcmaudio_categorizer_::check_aiff( meta.number_of_channels = folly::Endian::big(comm.num_chan); num_sample_frames = folly::Endian::big(comm.num_sample_frames); + if (meta.bits_per_sample < 8 || meta.bits_per_sample > 32) { + LOG_WARN << "[AIFF] " << path + << ": unsupported bits per sample: " << meta.bits_per_sample; + return false; + } + + if (meta.number_of_channels == 0) { + LOG_WARN << "[AIFF] " << path << ": file has no audio channels"; + return false; + } + meta_valid = true; LOG_TRACE << "[AIFF] " << path << ": meta=" << meta; @@ -459,7 +471,28 @@ bool pcmaudio_categorizer_::check_caf( meta.sample_padding = padding::LSB; meta.bits_per_sample = folly::Endian::big(fmt.bits_per_channel); meta.number_of_channels = folly::Endian::big(fmt.channels_per_frame); - meta.bytes_per_sample = fmt.bytes_per_packet / meta.number_of_channels; + + if (meta.bits_per_sample < 8 || meta.bits_per_sample > 32) { + LOG_WARN << "[CAF] " << path + << ": unsupported bits per sample: " << meta.bits_per_sample; + return false; + } + + if (meta.number_of_channels == 0) { + LOG_WARN << "[CAF] " << path << ": file has no audio channels"; + return false; + } + + if (fmt.bytes_per_packet == 0) { + LOG_WARN << "[CAF] " << path << ": bytes per packet is zero"; + return false; + } + + if (fmt.bytes_per_packet > 4 * meta.number_of_channels) { + LOG_WARN << "[CAF] " << path + << ": bytes per packet out of range: " << fmt.bytes_per_packet; + return false; + } if (fmt.bytes_per_packet % meta.number_of_channels != 0) { LOG_WARN << "[CAF] " << path @@ -468,6 +501,10 @@ bool pcmaudio_categorizer_::check_caf( return false; } + meta.bytes_per_sample = fmt.bytes_per_packet / meta.number_of_channels; + + assert(meta.bytes_per_sample > 0); + meta_valid = true; LOG_TRACE << "[CAF] " << path << ": meta=" << meta; From 0d7f08515a93d0600557166b17ecb3d42a5a4921 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 18 Jul 2023 20:22:57 +0200 Subject: [PATCH 015/571] More metadata checks --- .../categorizer/pcmaudio_categorizer.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index 989a6f6e1..30592c8e5 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -104,6 +104,28 @@ struct pcmaudio_metadata { // uint32_t samples_per_second; auto operator<=>(pcmaudio_metadata const&) const = default; + + bool check() const { + if (!(bits_per_sample == 8 || bits_per_sample == 16 || + bits_per_sample == 20 || bits_per_sample == 24 || + bits_per_sample == 32)) { + return false; + } + + if (bits_per_sample == 8 && bytes_per_sample != 1) { + return false; + } + + if (bits_per_sample == 16 && bytes_per_sample != 2) { + return false; + } + + if (bits_per_sample == 32 && bytes_per_sample != 4) { + return false; + } + + return bytes_per_sample == 3 || bytes_per_sample == 4; + } }; std::ostream& operator<<(std::ostream& os, pcmaudio_metadata const& m) { @@ -292,6 +314,11 @@ bool pcmaudio_categorizer_::check_aiff( return false; } + if (!meta.check()) { + LOG_WARN << "[AIFF] " << path << ": metadata check failed: " << meta; + return false; + } + meta_valid = true; LOG_TRACE << "[AIFF] " << path << ": meta=" << meta; @@ -505,6 +532,11 @@ bool pcmaudio_categorizer_::check_caf( assert(meta.bytes_per_sample > 0); + if (!meta.check()) { + LOG_WARN << "[CAF] " << path << ": metadata check failed: " << meta; + return false; + } + meta_valid = true; LOG_TRACE << "[CAF] " << path << ": meta=" << meta; From 99adfdbf228d65c7253426b9c4393256ec2ef531 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 01:18:07 +0200 Subject: [PATCH 016/571] Clean up pcmaudio categorizer and add wav-like formats --- .../categorizer/pcmaudio_categorizer.cpp | 514 ++++++++++++++---- 1 file changed, 408 insertions(+), 106 deletions(-) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index 30592c8e5..4c9ad5d7a 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -106,6 +106,12 @@ struct pcmaudio_metadata { auto operator<=>(pcmaudio_metadata const&) const = default; bool check() const { + // make sure we're supporting a reasonable subset + + if (number_of_channels == 0) { + return false; + } + if (!(bits_per_sample == 8 || bits_per_sample == 16 || bits_per_sample == 20 || bits_per_sample == 24 || bits_per_sample == 32)) { @@ -120,12 +126,194 @@ struct pcmaudio_metadata { return false; } + if ((bits_per_sample == 20 || bits_per_sample == 24) && + !(bytes_per_sample == 3 || bytes_per_sample == 4)) { + return false; + } + if (bits_per_sample == 32 && bytes_per_sample != 4) { return false; } - return bytes_per_sample == 3 || bytes_per_sample == 4; + return true; + } +}; + +template +struct endian; + +template <> +struct endian { + template + static T convert(T x) { + return folly::Endian::big(x); + } +}; + +template <> +struct endian { + template + static T convert(T x) { + return folly::Endian::little(x); + } +}; + +struct WavPolicy { + using SizeType = uint32_t; + static constexpr bool const size_includes_header{false}; + static constexpr size_t const id_size{4}; + static constexpr size_t const file_header_size{12}; + static constexpr size_t const chunk_header_size{8}; + static constexpr std::string_view const format_name{"WAV"}; + static constexpr std::string_view const file_header_id{"RIFF"}; + static constexpr std::string_view const wave_id{"WAVE"}; + static constexpr std::string_view const fmt_id{"fmt "}; + static constexpr std::string_view const data_id{"data"}; +}; + +struct Wav64Policy { + using SizeType = uint64_t; + static constexpr bool const size_includes_header{true}; + static constexpr size_t const id_size{16}; + static constexpr size_t const file_header_size{40}; + static constexpr size_t const chunk_header_size{24}; + static constexpr std::string_view const format_name{"WAV64"}; + static constexpr std::string_view const file_header_id{ + "riff\x2e\x91\xcf\x11\xa5\xd6\x28\xdb\x04\xc1\x00\x00", id_size}; + static constexpr std::string_view const wave_id{ + "wave\xf3\xac\xd3\x11\x8c\xd1\x00\xc0\x4f\x8e\xdb\x8a", id_size}; + static constexpr std::string_view const fmt_id{ + "fmt \xf3\xac\xd3\x11\x8c\xd1\x00\xc0\x4f\x8e\xdb\x8a", id_size}; + static constexpr std::string_view const data_id{ + "data\xf3\xac\xd3\x11\x8c\xd1\x00\xc0\x4f\x8e\xdb\x8a", id_size}; +}; + +template +class iff_parser final { + public: + struct chunk { + ChunkHeaderType header; + size_t pos; + + bool is(std::string_view id) const { + assert(sizeof(header.id) == id.size()); + return std::memcmp(header.id, id.data(), sizeof(header.id)) == 0; + } + + std::string_view id() const { + return std::string_view(header.id, sizeof(header.id)); + } + + std::string_view fourcc() const { + static_assert(sizeof(header.id) >= 4); + return std::string_view(header.id, 4); + } + + size_t size() const { return header.size; } + }; + + iff_parser(logger& lgr, std::string_view name, fs::path const& path, + std::span data, size_t pos) + : LOG_PROXY_INIT(lgr) + , data_{data} + , name_{name} + , path_{path} + , pos_{pos} {} + + std::optional next_chunk() { + std::optional c; + + if (pos_ + sizeof(ChunkHeaderType) <= data_.size()) { + c.emplace(); + + DWARFS_CHECK(read(c->header, pos_), "iff_parser::read failed"); + c->header.size = endian::convert(c->header.size); + c->pos = pos_; + + if constexpr (IsCaf) { + if (c->header.size == + std::numeric_limitsheader.size)>::max() && + c->is("data")) { + c->header.size = data_.size() - (pos_ + sizeof(ChunkHeaderType)); + } + } + + if constexpr (!SizeIncludesHeader) { + pos_ += sizeof(ChunkHeaderType); + } + + pos_ += c->header.size; + + if (pos_ > data_.size()) { + LOG_WARN << "[" << name_ << "] " << path_ + << ": unexpected end of file (pos=" << pos_ + << ", hdr.size=" << c->header.size << ", end=" << data_.size() + << ")"; + c.reset(); + } + + LOG_TRACE << "[" << name_ << "] " << path_ << ": `" << c->fourcc() + << "` (len=" << c->size() << ")"; + } + + return c; + } + + template + bool read(T& storage, chunk const& c) const { + return read(storage, c, sizeof(storage)); + } + + template + bool read(T& storage, chunk const& c, size_t len) const { + assert(len <= c.size()); + return read(storage, c.pos + sizeof(ChunkHeaderType), len); + } + + template + bool read_file_header(T& storage) const { + return read(storage, 0, sizeof(storage)); + } + + bool expected_size(chunk c, size_t expected_size) const { + if (c.size() == expected_size) { + return true; + } + + LOG_WARN << "[" << name_ << "] " << path_ << ": unexpected size for `" + << c.fourcc() << "` chunk: " << c.size() << " (expected " + << expected_size << ")"; + + return false; + } + + private: + template + bool read(T& storage, size_t pos) const { + return read(storage, pos, sizeof(storage)); } + + template + bool read(T& storage, size_t pos, size_t len) const { + assert(len <= sizeof(storage)); + + if (pos + len <= data_.size()) { + std::memcpy(&storage, data_.data() + pos, len); + return true; + } + + LOG_WARN << "[" << name_ << "] " << path_ << ": unexpected end of file"; + + return false; + } + + LOG_PROXY_DECL(LoggerPolicy); + std::span data_; + std::string_view name_; + fs::path const& path_; + size_t pos_; }; std::ostream& operator<<(std::ostream& os, pcmaudio_metadata const& m) { @@ -211,6 +399,11 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base { std::span data, category_mapper const& mapper) const; + template + bool check_wav_like(inode_fragments& frag, fs::path const& path, + std::span data, + category_mapper const& mapper) const; + LOG_PROXY_DECL(LoggerPolicy); folly::Synchronized mutable meta_; }; @@ -235,7 +428,13 @@ bool pcmaudio_categorizer_::check_aiff( FOLLY_PACK_PUSH - struct chk_hdr_t { + struct file_hdr_t { + char id[4]; + uint32_t size; + char form[4]; + } FOLLY_PACK_ATTR; + + struct chunk_hdr_t { char id[4]; uint32_t size; } FOLLY_PACK_ATTR; @@ -254,46 +453,32 @@ bool pcmaudio_categorizer_::check_aiff( FOLLY_PACK_POP - static_assert(sizeof(chk_hdr_t) == 8); + static_assert(sizeof(chunk_hdr_t) == 8); static_assert(sizeof(comm_chk_t) == 8); static_assert(sizeof(ssnd_chk_t) == 8); + iff_parser parser( + LOG_GET_LOGGER, "AIFF", path, data, sizeof(file_hdr_t)); + bool meta_valid{false}; uint32_t num_sample_frames; pcmaudio_metadata meta; - size_t pos = 12; - chk_hdr_t chk_hdr; - - while (pos + sizeof(chk_hdr) <= data.size()) { - std::memcpy(&chk_hdr, data.data() + pos, sizeof(chk_hdr)); - uint32_t chk_size = folly::Endian::big(chk_hdr.size); - - LOG_TRACE << "[AIFF] " << path << ": " << std::string_view(chk_hdr.id, 4) - << " (len=" << chk_size << ")"; - if (pos + sizeof(chk_hdr) + chk_size > data.size()) { - LOG_WARN << "[AIFF] " << path << ": unexpected end of file"; - // corrupt AIFF? -> skip - return false; - } - - if (std::memcmp(chk_hdr.id, "COMM", 4) == 0) { - if (chk_size != 18) { - LOG_WARN << "[AIFF] " << path - << ": unexpected size for COMM chunk: " << chk_size - << " (expected 18)"; - // corrupt AIFF? -> skip + while (auto chunk = parser.next_chunk()) { + if (chunk->is("COMM")) { + if (parser.expected_size(*chunk, 18)) { return false; } if (meta_valid) { - LOG_WARN << "[AIFF] " << path << ": unexpected second COMM chunk"; - // corrupt AIFF? -> skip + LOG_WARN << "[AIFF] " << path << ": unexpected second `COMM` chunk"; return false; } comm_chk_t comm; - std::memcpy(&comm, data.data() + pos + sizeof(chk_hdr), sizeof(comm)); + if (!parser.read(comm, *chunk)) { + return false; + } meta.sample_endianness = endianness::BIG; meta.sample_signedness = signedness::SIGNED; @@ -303,17 +488,6 @@ bool pcmaudio_categorizer_::check_aiff( meta.number_of_channels = folly::Endian::big(comm.num_chan); num_sample_frames = folly::Endian::big(comm.num_sample_frames); - if (meta.bits_per_sample < 8 || meta.bits_per_sample > 32) { - LOG_WARN << "[AIFF] " << path - << ": unsupported bits per sample: " << meta.bits_per_sample; - return false; - } - - if (meta.number_of_channels == 0) { - LOG_WARN << "[AIFF] " << path << ": file has no audio channels"; - return false; - } - if (!meta.check()) { LOG_WARN << "[AIFF] " << path << ": metadata check failed: " << meta; return false; @@ -322,28 +496,31 @@ bool pcmaudio_categorizer_::check_aiff( meta_valid = true; LOG_TRACE << "[AIFF] " << path << ": meta=" << meta; - } else if (std::memcmp(chk_hdr.id, "SSND", 4) == 0) { + } else if (chunk->is("SSND")) { if (!meta_valid) { - LOG_WARN << "[AIFF] " << path << ": got SSND chunk without COMM chunk"; - // corrupt AIFF? -> skip + LOG_WARN << "[AIFF] " << path + << ": got `SSND` chunk without `COMM` chunk"; return false; } ssnd_chk_t ssnd; - std::memcpy(&ssnd, data.data() + pos + sizeof(chk_hdr), sizeof(ssnd)); + if (!parser.read(ssnd, *chunk)) { + return false; + } + ssnd.offset = folly::Endian::big(ssnd.offset); ssnd.block_size = folly::Endian::big(ssnd.block_size); - size_t pcm_start = pos + sizeof(chk_hdr) + sizeof(ssnd) + ssnd.offset; + size_t pcm_start = + chunk->pos + sizeof(chunk_hdr_t) + sizeof(ssnd) + ssnd.offset; size_t pcm_length = num_sample_frames * (meta.number_of_channels * meta.bytes_per_sample); - if (sizeof(ssnd) + ssnd.offset + pcm_length > chk_size) { + if (sizeof(ssnd) + ssnd.offset + pcm_length > chunk->size()) { LOG_WARN << "[AIFF] " << path << ": SSND invalid chunk size (offset=" << ssnd.offset - << ", pcm_len=" << pcm_length << ", chk_size" << chk_size + << ", pcm_len=" << pcm_length << ", chk_size" << chunk->size() << ")"; - // corrupt AIFF? -> skip return false; } @@ -362,8 +539,6 @@ bool pcmaudio_categorizer_::check_aiff( return true; } - - pos += sizeof(chk_hdr) + chk_size; } return false; @@ -380,11 +555,12 @@ bool pcmaudio_categorizer_::check_caf( FOLLY_PACK_PUSH struct caff_hdr_t { + char id[4]; uint16_t version; uint16_t flags; } FOLLY_PACK_ATTR; - struct chk_hdr_t { + struct chunk_hdr_t { char id[4]; uint64_t size; } FOLLY_PACK_ATTR; @@ -405,8 +581,8 @@ bool pcmaudio_categorizer_::check_caf( FOLLY_PACK_POP - static_assert(sizeof(caff_hdr_t) == 4); - static_assert(sizeof(chk_hdr_t) == 12); + static_assert(sizeof(caff_hdr_t) == 8); + static_assert(sizeof(chunk_hdr_t) == 12); static_assert(sizeof(format_chk_t) == 32); static_assert(sizeof(data_chk_t) == 4); @@ -414,8 +590,14 @@ bool pcmaudio_categorizer_::check_caf( static constexpr uint32_t const kCAFLinearPCMFormatFlagIsLittleEndian{1L << 1}; + iff_parser parser( + LOG_GET_LOGGER, "CAF", path, data, sizeof(caff_hdr_t)); + caff_hdr_t caff_hdr; - std::memcpy(&caff_hdr, data.data() + 4, sizeof(caff_hdr)); + if (!parser.read_file_header(caff_hdr)) { + return false; + } + caff_hdr.version = folly::Endian::big(caff_hdr.version); caff_hdr.flags = folly::Endian::big(caff_hdr.flags); @@ -428,44 +610,22 @@ bool pcmaudio_categorizer_::check_caf( bool meta_valid{false}; pcmaudio_metadata meta; - size_t pos = 8; - chk_hdr_t chk_hdr; - - while (pos + sizeof(chk_hdr) <= data.size()) { - std::memcpy(&chk_hdr, data.data() + pos, sizeof(chk_hdr)); - uint64_t chk_size = folly::Endian::big(chk_hdr.size); - - LOG_TRACE << "[CAF] " << path << ": " << std::string_view(chk_hdr.id, 4) - << " (len=" << chk_size << ")"; - if (chk_size == std::numeric_limits::max() && - std::memcmp(chk_hdr.id, "data", 4) == 0) { - chk_size = data.size() - (pos + sizeof(chk_hdr)); - } - - if (pos + sizeof(chk_hdr) + chk_size > data.size()) { - LOG_WARN << "[CAF] " << path << ": unexpected end of file"; - // corrupt CAF? -> skip - return false; - } - - if (std::memcmp(chk_hdr.id, "desc", 4) == 0) { - if (chk_size != sizeof(format_chk_t)) { - LOG_WARN << "[CAF] " << path - << ": unexpected size for desc chunk: " << chk_size - << " (expected " << sizeof(format_chk_t) << ")"; - // corrupt CAF? -> skip + while (auto chunk = parser.next_chunk()) { + if (chunk->is("desc")) { + if (!parser.expected_size(*chunk, sizeof(format_chk_t))) { return false; } if (meta_valid) { - LOG_WARN << "[CAF] " << path << ": unexpected second desc chunk"; - // corrupt CAF? -> skip + LOG_WARN << "[CAF] " << path << ": unexpected second `desc` chunk"; return false; } format_chk_t fmt; - std::memcpy(&fmt, data.data() + pos + sizeof(chk_hdr), sizeof(fmt)); + if (!parser.read(fmt, *chunk)) { + return false; + } if (std::memcmp(fmt.format_id, "lpcm", 4) != 0) { // TODO: alaw, ulaw? @@ -499,17 +659,6 @@ bool pcmaudio_categorizer_::check_caf( meta.bits_per_sample = folly::Endian::big(fmt.bits_per_channel); meta.number_of_channels = folly::Endian::big(fmt.channels_per_frame); - if (meta.bits_per_sample < 8 || meta.bits_per_sample > 32) { - LOG_WARN << "[CAF] " << path - << ": unsupported bits per sample: " << meta.bits_per_sample; - return false; - } - - if (meta.number_of_channels == 0) { - LOG_WARN << "[CAF] " << path << ": file has no audio channels"; - return false; - } - if (fmt.bytes_per_packet == 0) { LOG_WARN << "[CAF] " << path << ": bytes per packet is zero"; return false; @@ -540,22 +689,21 @@ bool pcmaudio_categorizer_::check_caf( meta_valid = true; LOG_TRACE << "[CAF] " << path << ": meta=" << meta; - } else if (std::memcmp(chk_hdr.id, "data", 4) == 0) { + } else if (chunk->is("data")) { if (!meta_valid) { - LOG_WARN << "[CAF] " << path << ": got data chunk without desc chunk"; - // corrupt CAF? -> skip + LOG_WARN << "[CAF] " << path + << ": got `data` chunk without `desc` chunk"; return false; } - size_t pcm_start = pos + sizeof(chk_hdr) + sizeof(data_chk_t); - size_t pcm_length = chk_size - sizeof(data_chk_t); + size_t pcm_start = chunk->pos + sizeof(chunk_hdr_t) + sizeof(data_chk_t); + size_t pcm_length = chunk->size() - sizeof(data_chk_t); if (pcm_length % (meta.number_of_channels * meta.bytes_per_sample)) { LOG_WARN << "[CAF] " << path - << ": data chunk size mismatch (pcm_len=" << pcm_length + << ": `data` chunk size mismatch (pcm_len=" << pcm_length << ", #chan=" << meta.number_of_channels << ", bytes_per_sample=" << meta.bytes_per_sample << ")"; - // corrupt CAF? -> skip return false; } @@ -574,8 +722,6 @@ bool pcmaudio_categorizer_::check_caf( return true; } - - pos += sizeof(chk_hdr) + chk_size; } return false; @@ -585,19 +731,175 @@ template bool pcmaudio_categorizer_::check_wav( inode_fragments& frag, fs::path const& path, std::span data, category_mapper const& mapper) const { - if (std::memcmp(data.data(), "RIFF", 4) != 0) { - return false; - } - return false; + return check_wav_like(frag, path, data, mapper); } template bool pcmaudio_categorizer_::check_wav64( inode_fragments& frag, fs::path const& path, std::span data, category_mapper const& mapper) const { - if (std::memcmp(data.data(), "riff", 4) != 0) { + return check_wav_like(frag, path, data, mapper); +} + +template +template +bool pcmaudio_categorizer_::check_wav_like( + inode_fragments& frag, fs::path const& path, std::span data, + category_mapper const& mapper) const { + if (std::memcmp(data.data(), FormatPolicy::file_header_id.data(), + FormatPolicy::id_size) != 0) { return false; } + + FOLLY_PACK_PUSH + + struct file_hdr_t { + char id[FormatPolicy::id_size]; + typename FormatPolicy::SizeType size; + char form[FormatPolicy::id_size]; + } FOLLY_PACK_ATTR; + + struct chunk_hdr_t { + char id[FormatPolicy::id_size]; + typename FormatPolicy::SizeType size; + } FOLLY_PACK_ATTR; + + struct fmt_chunk_t { + uint16_t format_code; + uint16_t num_channels; + uint32_t samples_per_sec; + uint32_t avg_bytes_per_sec; + uint16_t block_align; + uint16_t bits_per_sample; + uint16_t ext_size; + uint16_t valid_bits_per_sample; + uint32_t channel_mask; + uint16_t sub_format_code; + uint8_t guid_remainder[14]; + } FOLLY_PACK_ATTR; + + FOLLY_PACK_POP + + static_assert(sizeof(file_hdr_t) == FormatPolicy::file_header_size); + static_assert(sizeof(chunk_hdr_t) == FormatPolicy::chunk_header_size); + + static constexpr uint16_t const WAVE_FORMAT_PCM{0x0001}; + static constexpr uint16_t const WAVE_FORMAT_EXTENSIBLE{0xFFFE}; + + iff_parser + parser(LOG_GET_LOGGER, FormatPolicy::format_name, path, data, + sizeof(file_hdr_t)); + + file_hdr_t file_header; + if (!parser.read_file_header(file_header)) { + return false; + } + + if (std::memcmp(file_header.form, FormatPolicy::wave_id.data(), + FormatPolicy::id_size) != 0) { + return false; + } + + bool meta_valid{false}; + pcmaudio_metadata meta; + + while (auto chunk = parser.next_chunk()) { + if (chunk->is(FormatPolicy::fmt_id)) { + if (chunk->size() != 16 && chunk->size() != 18 && chunk->size() != 40) { + LOG_WARN << "[" << FormatPolicy::format_name << "] " << path + << ": unexpected size for `" << chunk->fourcc() + << "` chunk: " << chunk->size() << " (expected 16, 18, 40)"; + return false; + } + + if (meta_valid) { + LOG_WARN << "[" << FormatPolicy::format_name << "] " << path + << ": unexpected second `" << chunk->fourcc() << "` chunk"; + return false; + } + + fmt_chunk_t fmt; + if (!parser.read(fmt, *chunk, chunk->size())) { + return false; + } + + fmt.format_code = folly::Endian::little(fmt.format_code); + fmt.num_channels = folly::Endian::little(fmt.num_channels); + fmt.samples_per_sec = folly::Endian::little(fmt.samples_per_sec); + fmt.avg_bytes_per_sec = folly::Endian::little(fmt.avg_bytes_per_sec); + fmt.block_align = folly::Endian::little(fmt.block_align); + fmt.bits_per_sample = folly::Endian::little(fmt.bits_per_sample); + if (chunk->size() == 40) { + fmt.valid_bits_per_sample = + folly::Endian::little(fmt.valid_bits_per_sample); + fmt.sub_format_code = folly::Endian::little(fmt.sub_format_code); + } else { + fmt.sub_format_code = 0; + } + + if (!(fmt.format_code == WAVE_FORMAT_PCM || + (fmt.format_code == WAVE_FORMAT_EXTENSIBLE && chunk->size() == 40 && + fmt.sub_format_code == WAVE_FORMAT_PCM))) { + LOG_TRACE << "[" << FormatPolicy::format_name << "] " << path + << ": unsupported format: " << fmt.format_code << "/" + << fmt.sub_format_code; + return false; + } + + meta.sample_endianness = endianness::LITTLE; + meta.sample_signedness = + fmt.bits_per_sample > 8 ? signedness::SIGNED : signedness::UNSIGNED; + meta.sample_padding = padding::LSB; + meta.bits_per_sample = fmt.bits_per_sample; + meta.bytes_per_sample = (meta.bits_per_sample + 7) / 8; + meta.number_of_channels = fmt.num_channels; + + if (!meta.check()) { + LOG_WARN << "[" << FormatPolicy::format_name << "] " << path + << ": metadata check failed: " << meta; + return false; + } + + meta_valid = true; + + LOG_TRACE << "[" << FormatPolicy::format_name << "] " << path + << ": meta=" << meta; + } else if (chunk->is(FormatPolicy::data_id)) { + if (!meta_valid) { + LOG_WARN << "[" << FormatPolicy::format_name << "] " << path + << ": got `data` chunk without `fmt ` chunk"; + return false; + } + + size_t pcm_start = chunk->pos + sizeof(chunk_hdr_t); + size_t pcm_length = chunk->size(); + + if (pcm_length % (meta.number_of_channels * meta.bytes_per_sample)) { + LOG_WARN << "[" << FormatPolicy::format_name << "] " << path + << ": `data` chunk size mismatch (pcm_len=" << pcm_length + << ", #chan=" << meta.number_of_channels + << ", bytes_per_sample=" << meta.bytes_per_sample << ")"; + return false; + } + + fragment_category::value_type subcategory = meta_.wlock()->add(meta); + + frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)), + pcm_start); + frag.emplace_back( + fragment_category(mapper(PCMAUDIO_CATEGORY), subcategory), + pcm_length); + + if (pcm_start + pcm_length < data.size()) { + frag.emplace_back(fragment_category(mapper(METADATA_CATEGORY)), + data.size() - (pcm_start + pcm_length)); + } + + return true; + } + } + return false; } From 2544195abf161c7f7922f7ed3dcc6c33198a498f Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 01:48:16 +0200 Subject: [PATCH 017/571] Check for invalid chunk size in WAV64 --- src/dwarfs/categorizer/pcmaudio_categorizer.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index 4c9ad5d7a..cb9736e12 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -240,7 +240,14 @@ class iff_parser final { } } - if constexpr (!SizeIncludesHeader) { + if constexpr (SizeIncludesHeader) { + if (c->header.size < sizeof(ChunkHeaderType)) { + LOG_WARN << "[" << name_ << "] " << path_ + << ": invalid chunk size: " << c->header.size; + c.reset(); + return c; + } + } else { pos_ += sizeof(ChunkHeaderType); } @@ -252,6 +259,7 @@ class iff_parser final { << ", hdr.size=" << c->header.size << ", end=" << data_.size() << ")"; c.reset(); + return c; } LOG_TRACE << "[" << name_ << "] " << path_ << ": `" << c->fourcc() From d6d279fbb459e95356e1c2f363aa2e73c0d4a6a4 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 02:03:22 +0200 Subject: [PATCH 018/571] Fix AIFF parser bug, reduce min size --- src/dwarfs/categorizer/pcmaudio_categorizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp index cb9736e12..ff854f191 100644 --- a/src/dwarfs/categorizer/pcmaudio_categorizer.cpp +++ b/src/dwarfs/categorizer/pcmaudio_categorizer.cpp @@ -48,7 +48,7 @@ namespace { constexpr std::string_view const METADATA_CATEGORY{"metadata"}; constexpr std::string_view const PCMAUDIO_CATEGORY{"pcmaudio"}; -constexpr size_t const MIN_PCMAUDIO_SIZE{64}; +constexpr size_t const MIN_PCMAUDIO_SIZE{32}; enum class endianness : uint8_t { BIG, @@ -474,7 +474,7 @@ bool pcmaudio_categorizer_::check_aiff( while (auto chunk = parser.next_chunk()) { if (chunk->is("COMM")) { - if (parser.expected_size(*chunk, 18)) { + if (!parser.expected_size(*chunk, 18)) { return false; } From 62e3805b13bb804ba2e9e6ba66c8b12d6c390e2f Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 14:39:12 +0200 Subject: [PATCH 019/571] Update categorizer_manager interface --- include/dwarfs/categorizer.h | 6 ++---- src/dwarfs/categorizer.cpp | 18 +++--------------- src/dwarfs/inode_manager.cpp | 4 ---- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h index 0d0847280..d5ff747f1 100644 --- a/include/dwarfs/categorizer.h +++ b/include/dwarfs/categorizer.h @@ -103,9 +103,7 @@ class categorizer_job { explicit operator bool() const { return impl_ != nullptr; } - bool has_multi_fragment_sequential_categorizers() const { - return impl_->has_multi_fragment_sequential_categorizers(); - } + bool best_result_found() const { return impl_->best_result_found(); } class impl { public: @@ -115,7 +113,7 @@ class categorizer_job { virtual void categorize_random_access(std::span data) = 0; virtual void categorize_sequential(std::span data) = 0; virtual inode_fragments result() = 0; - virtual bool has_multi_fragment_sequential_categorizers() const = 0; + virtual bool best_result_found() const = 0; }; private: diff --git a/src/dwarfs/categorizer.cpp b/src/dwarfs/categorizer.cpp index 781939415..fc3ae7106 100644 --- a/src/dwarfs/categorizer.cpp +++ b/src/dwarfs/categorizer.cpp @@ -45,7 +45,6 @@ class categorizer_manager_private : public categorizer_manager::impl { categorizers() const = 0; virtual fragment_category::value_type category(std::string_view cat) const = 0; - virtual bool has_multi_fragment_sequential_categorizers() const = 0; }; template @@ -63,7 +62,7 @@ class categorizer_job_ final : public categorizer_job::impl { void categorize_random_access(std::span data) override; void categorize_sequential(std::span data) override; inode_fragments result() override; - bool has_multi_fragment_sequential_categorizers() const override; + bool best_result_found() const override; private: LOG_PROXY_DECL(LoggerPolicy); @@ -157,9 +156,8 @@ inode_fragments categorizer_job_::result() { } template -bool categorizer_job_< - LoggerPolicy>::has_multi_fragment_sequential_categorizers() const { - return mgr_.has_multi_fragment_sequential_categorizers(); +bool categorizer_job_::best_result_found() const { + return is_global_best_; } categorizer_job::categorizer_job() = default; @@ -192,10 +190,6 @@ class categorizer_manager_ final : public categorizer_manager_private { return it->second; } - bool has_multi_fragment_sequential_categorizers() const override { - return has_multi_fragment_sequential_categorizers_; - } - private: void add_category(std::string_view cat, size_t categorizer_index) { if (catmap_.emplace(cat, categories_.size()).second) { @@ -210,7 +204,6 @@ class categorizer_manager_ final : public categorizer_manager_private { std::vector> categorizers_; std::vector> categories_; std::unordered_map catmap_; - bool has_multi_fragment_sequential_categorizers_{false}; }; template @@ -220,11 +213,6 @@ void categorizer_manager_::add( add_category(c, categorizers_.size()); } - if (!c->is_single_fragment() && - dynamic_cast(c.get())) { - has_multi_fragment_sequential_categorizers_ = true; - } - categorizers_.emplace_back(std::move(c)); } diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 1290365e8..44eb7e2d9 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -155,10 +155,6 @@ class inode_ : public inode { if (mm) { if (catjob) { - DWARFS_CHECK( - !catjob.has_multi_fragment_sequential_categorizers(), - "multi-fragment sequential categorizers are not supported"); - catjob.set_total_size(mm->size()); catjob.categorize_random_access(mm->span()); } From 3c3d67a2d672a142590159f6aa3be0f09b4874c2 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 14:39:42 +0200 Subject: [PATCH 020/571] Add chunks vector to single_inode_fragment --- include/dwarfs/inode_fragments.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/dwarfs/inode_fragments.h b/include/dwarfs/inode_fragments.h index 2c54d25a8..7bc9eeb17 100644 --- a/include/dwarfs/inode_fragments.h +++ b/include/dwarfs/inode_fragments.h @@ -31,6 +31,8 @@ #include "dwarfs/fragment_category.h" #include "dwarfs/types.h" +#include "dwarfs/gen-cpp2/metadata_types.h" + namespace dwarfs { class single_inode_fragment { @@ -46,6 +48,7 @@ class single_inode_fragment { private: fragment_category category_; file_off_t length_; + folly::small_vector chunks_; }; class inode_fragments { From 04701f09a9f0b413ac528fd97d5bc9427591c185 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 14:40:56 +0200 Subject: [PATCH 021/571] Replace optional by plain uint32 + flags field --- src/dwarfs/inode_manager.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 44eb7e2d9..2bd3900b0 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -99,11 +99,16 @@ class inode_ : public inode { } void set_num(uint32_t num) override { - DWARFS_CHECK(!num_, "attempt to set inode number multiple times"); + DWARFS_CHECK((flags_ & kNumIsValid) == 0, + "attempt to set inode number multiple times"); num_ = num; + flags_ |= kNumIsValid; } - uint32_t num() const override { return num_.value(); } + uint32_t num() const override { + DWARFS_CHECK((flags_ & kNumIsValid) != 0, "inode number is not set"); + return num_; + } uint32_t similarity_hash() const override { assert(similarity_valid_); @@ -234,8 +239,11 @@ class inode_ : public inode { private: // TODO: can we move optional stuff (e.g. nilsimsa_similarity_hash_) out of // here? - std::optional num_; uint32_t similarity_hash_{0}; + static constexpr uint32_t const kNumIsValid{UINT32_C(1) << 0}; + + uint32_t flags_{0}; + uint32_t num_; inode_fragments fragments_; files_vector files_; std::vector chunks_; From 611d1ef28dba79d9a4691cdb2f1017038539f7a6 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 14:43:02 +0200 Subject: [PATCH 022/571] Refactor similarity handling in inode manager --- src/dwarfs/inode_manager.cpp | 50 +++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index 2bd3900b0..3cdc639d7 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -30,10 +30,14 @@ #include #include #include +#include #include #include +#include +#include + #include "dwarfs/categorizer.h" #include "dwarfs/compiler.h" #include "dwarfs/entry.h" @@ -158,6 +162,19 @@ class inode_ : public inode { opts.categorizer_mgr->job(mm ? mm->path().string() : ""); } + /// + /// 1. Run random access categorizers + /// 2. If we *have* a best category already (need a call for that), + /// we can immediately compute similarity hashes for all fragments + /// (or not, if the category is configured not to use similarity) + /// 3. If we *don't* have a best category yet, we can run similarity + /// hashing while running the sequential categorizer(s). + /// 4. If we end up with multiple fragments after all, we might have + /// to re-run similarity hashing. This means we can also drop the + /// multi-fragment sequential categorizer check, as we can just + /// as well support that case. + /// + if (mm) { if (catjob) { catjob.set_total_size(mm->size()); @@ -237,20 +254,39 @@ class inode_ : public inode { inode_fragments const& fragments() const override { return fragments_; } private: - // TODO: can we move optional stuff (e.g. nilsimsa_similarity_hash_) out of - // here? - uint32_t similarity_hash_{0}; + using similarity_map_type = + folly::sorted_vector_map>; + static constexpr uint32_t const kNumIsValid{UINT32_C(1) << 0}; uint32_t flags_{0}; uint32_t num_; inode_fragments fragments_; files_vector files_; - std::vector chunks_; - nilsimsa::hash_type nilsimsa_similarity_hash_; + + std::variant< + // in case of no hashes at all + std::monostate, + + // in case of only a single fragment + nilsimsa::hash_type, // 32 bytes + uint32_t, // 4 bytes + + // in case of multiple fragments + similarity_map_type // 24 bytes + > + similarity_; + + // OLDE: + uint32_t similarity_hash_{0}; // TODO: remove (move to similarity_) + std::vector chunks_; // TODO: remove (part of fragments_ now) + nilsimsa::hash_type + nilsimsa_similarity_hash_; // TODO: remove (move to similarity_) #ifndef NDEBUG - bool similarity_valid_{false}; - bool nilsimsa_valid_{false}; + // no longer needed because we now know which are valid + bool similarity_valid_{false}; // TODO: remove + bool nilsimsa_valid_{false}; // TODO: remove #endif }; From 34beffceb3b34d3934c29c2a8f07412426938e01 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Wed, 19 Jul 2023 17:47:38 +0200 Subject: [PATCH 023/571] Integrate categorizers into inode manager --- CMakeLists.txt | 2 + include/dwarfs/categorizer.h | 10 + include/dwarfs/category_parser.h | 43 +++ include/dwarfs/contextual_option.h | 158 ++++++++++ include/dwarfs/file_scanner.h | 1 - include/dwarfs/fragment_category.h | 19 ++ include/dwarfs/fragment_order_parser.h | 37 +++ include/dwarfs/inode.h | 1 - include/dwarfs/inode_fragments.h | 7 + include/dwarfs/inode_manager.h | 23 +- include/dwarfs/nilsimsa.h | 5 + include/dwarfs/options.h | 48 ++- include/dwarfs/similarity.h | 5 + src/dwarfs/categorizer.cpp | 27 +- .../incompressible_categorizer.cpp | 7 + src/dwarfs/category_parser.cpp | 56 ++++ src/dwarfs/file_scanner.cpp | 27 +- src/dwarfs/fragment_order_parser.cpp | 127 ++++++++ src/dwarfs/inode_manager.cpp | 275 +++++++++++++----- src/dwarfs/scanner.cpp | 32 +- src/mkdwarfs_main.cpp | 119 ++------ test/dwarfs.cpp | 26 +- test/dwarfs_benchmark.cpp | 2 - 23 files changed, 808 insertions(+), 249 deletions(-) create mode 100644 include/dwarfs/category_parser.h create mode 100644 include/dwarfs/contextual_option.h create mode 100644 include/dwarfs/fragment_order_parser.h create mode 100644 src/dwarfs/category_parser.cpp create mode 100644 src/dwarfs/fragment_order_parser.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2db2ec428..64e343f34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -359,6 +359,7 @@ list( src/dwarfs/builtin_script.cpp src/dwarfs/cached_block.cpp src/dwarfs/categorizer.cpp + src/dwarfs/category_parser.cpp src/dwarfs/checksum.cpp src/dwarfs/chmod_transformer.cpp src/dwarfs/console_writer.cpp @@ -371,6 +372,7 @@ list( src/dwarfs/filesystem_extractor.cpp src/dwarfs/filesystem_v2.cpp src/dwarfs/filesystem_writer.cpp + src/dwarfs/fragment_order_parser.cpp src/dwarfs/fstypes.cpp src/dwarfs/fs_section.cpp src/dwarfs/global_entry_data.cpp diff --git a/include/dwarfs/categorizer.h b/include/dwarfs/categorizer.h index d5ff747f1..74fe6a2a1 100644 --- a/include/dwarfs/categorizer.h +++ b/include/dwarfs/categorizer.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -124,6 +125,8 @@ class categorizer_manager { public: categorizer_manager(logger& lgr); + static fragment_category default_category(); + void add(std::shared_ptr c) { impl_->add(std::move(c)); } categorizer_job job(std::filesystem::path const& path) const { @@ -134,6 +137,11 @@ class categorizer_manager { return impl_->category_name(c); } + std::optional + category_value(std::string_view name) const { + return impl_->category_value(name); + } + folly::dynamic category_metadata(fragment_category c) const { return impl_->category_metadata(c); } @@ -146,6 +154,8 @@ class categorizer_manager { virtual categorizer_job job(std::filesystem::path const& path) const = 0; virtual std::string_view category_name(fragment_category::value_type c) const = 0; + virtual std::optional + category_value(std::string_view name) const = 0; virtual folly::dynamic category_metadata(fragment_category c) const = 0; }; diff --git a/include/dwarfs/category_parser.h b/include/dwarfs/category_parser.h new file mode 100644 index 000000000..e4dd690aa --- /dev/null +++ b/include/dwarfs/category_parser.h @@ -0,0 +1,43 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include + +#include "dwarfs/fragment_category.h" + +namespace dwarfs { + +class categorizer_manager; + +class category_parser { + public: + category_parser(std::shared_ptr catmgr); + + std::vector parse(std::string_view arg) const; + + private: + std::shared_ptr catmgr_; +}; + +} // namespace dwarfs diff --git a/include/dwarfs/contextual_option.h b/include/dwarfs/contextual_option.h new file mode 100644 index 000000000..8b6176134 --- /dev/null +++ b/include/dwarfs/contextual_option.h @@ -0,0 +1,158 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace dwarfs { + +template +class contextual_option { + public: + using policy_type = Policy; + using context_argument_type = typename policy_type::ContextArgumentType; + using context_type = typename policy_type::ContextType; + using option_type = typename policy_type::OptionType; + + contextual_option() = default; + explicit contextual_option(option_type const& def) + : default_{def} {} + + void set_default(option_type const& val) { default_ = val; } + + void add_contextual(context_type const& ctx, option_type const& val) { + contextual_[ctx] = val; + } + + std::optional + get_optional(context_argument_type const& arg) const { + if constexpr (std::is_same_v) { + return get_optional_impl(arg); + } else { + return get_optional_impl(policy_type::context_from_arg(arg)); + } + } + + option_type get(context_argument_type const& arg) const { + if constexpr (std::is_same_v) { + return get_impl(arg); + } else { + return get_impl(policy_type::context_from_arg(arg)); + } + } + + std::optional get_optional() const { return default_; } + + option_type get() const { return default_.value(); } + + template + bool any_is(T&& pred) const { + for (auto e : contextual_) { + if (pred(e.second)) { + return true; + } + } + return default_ && pred(*default_); + } + + private: + std::optional get_optional_impl(context_type const& ctx) const { + if (auto it = contextual_.find(ctx); it != contextual_.end()) { + return it->second; + } + return default_; + } + + option_type get_impl(context_type const& ctx) const { + if (auto it = contextual_.find(ctx); it != contextual_.end()) { + return it->second; + } + return default_.value(); + } + + std::optional default_; + std::unordered_map contextual_; +}; + +template +class contextual_option_parser { + public: + using option_type = OptionType; + using policy_type = typename option_type::policy_type; + + contextual_option_parser(OptionType& opt, ContextParser const& cp, + OptionParser const& op) + : opt_{opt} + , cp_{cp} + , op_{op} {} + + void parse(std::string_view arg) const { + try { + auto pos = arg.find("::"); + + if (pos == arg.npos) { + opt_.set_default(op_.parse(arg)); + } else { + auto ctx = arg.substr(0, pos); + auto val = op_.parse(arg.substr(pos + 2)); + if constexpr (std::is_same_v< + std::invoke_result_t, + typename option_type::context_type>) { + opt_.add_contextual(cp_.parse(ctx), val); + } else { + for (auto c : cp_.parse(ctx)) { + opt_.add_contextual(c, val); + } + } + } + } catch (std::exception const& e) { + throw std::runtime_error( + fmt::format("failed to parse: {} ({})", arg, e.what())); + } + } + + void parse(std::span list) const { + for (auto const& arg : list) { + parse(arg); + } + } + + void parse(std::span list) const { + for (auto const& arg : list) { + parse(arg); + } + } + + private: + OptionType& opt_; + ContextParser const& cp_; + OptionParser const& op_; +}; + +} // namespace dwarfs diff --git a/include/dwarfs/file_scanner.h b/include/dwarfs/file_scanner.h index 6b8509c4d..a4d05f3c4 100644 --- a/include/dwarfs/file_scanner.h +++ b/include/dwarfs/file_scanner.h @@ -40,7 +40,6 @@ namespace detail { class file_scanner { public: file_scanner(worker_group& wg, os_access& os, inode_manager& im, - inode_options const& ino_opts, std::optional const& hash_algo, progress& prog); void scan(file* p) { impl_->scan(p); } diff --git a/include/dwarfs/fragment_category.h b/include/dwarfs/fragment_category.h index 4bc307ee2..4425bb8e2 100644 --- a/include/dwarfs/fragment_category.h +++ b/include/dwarfs/fragment_category.h @@ -25,6 +25,8 @@ #include #include +#include + namespace dwarfs { class fragment_category { @@ -88,9 +90,26 @@ class fragment_category { return subcategory_; } + auto operator<=>(fragment_category const&) const = default; + + size_t hash() const { + return folly::hash::hash_combine(value_, subcategory_); + } + private: value_type value_{uninitialized}; value_type subcategory_{uninitialized}; }; } // namespace dwarfs + +namespace std { + +template <> +struct hash { + std::size_t operator()(dwarfs::fragment_category const& k) const { + return k.hash(); + } +}; + +} // namespace std diff --git a/include/dwarfs/fragment_order_parser.h b/include/dwarfs/fragment_order_parser.h new file mode 100644 index 000000000..86023bd85 --- /dev/null +++ b/include/dwarfs/fragment_order_parser.h @@ -0,0 +1,37 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * dwarfs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with dwarfs. If not, see . + */ + +#pragma once + +#include + +#include "dwarfs/options.h" + +namespace dwarfs { + +struct fragment_order_parser { + public: + static std::string choices(); + + file_order_options parse(std::string_view arg) const; +}; + +} // namespace dwarfs diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index 703e113cc..b05e67dba 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -47,7 +47,6 @@ class inode : public object { using files_vector = folly::small_vector; virtual void set_files(files_vector&& fv) = 0; - virtual void set_similarity_valid(inode_options const& opts) = 0; virtual void scan(mmif* mm, inode_options const& options) = 0; virtual void set_num(uint32_t num) = 0; virtual uint32_t num() const = 0; diff --git a/include/dwarfs/inode_fragments.h b/include/dwarfs/inode_fragments.h index 7bc9eeb17..aa044cdf8 100644 --- a/include/dwarfs/inode_fragments.h +++ b/include/dwarfs/inode_fragments.h @@ -65,10 +65,17 @@ class inode_fragments { std::span span() const { return fragments_; } + size_t size() const { return fragments_.size(); } + bool empty() const { return fragments_.empty(); } void clear() { fragments_.clear(); } + fragment_category get_single_category() const { + assert(fragments_.size() == 1); + return fragments_.at(0).category(); + } + explicit operator bool() const { return !empty(); } std::ostream& diff --git a/include/dwarfs/inode_manager.h b/include/dwarfs/inode_manager.h index 1678fb53c..fc616eba0 100644 --- a/include/dwarfs/inode_manager.h +++ b/include/dwarfs/inode_manager.h @@ -32,27 +32,29 @@ namespace dwarfs { +class file; class inode; class logger; +class os_access; class progress; class script; +class worker_group; -struct file_order_options; +struct inode_options; class inode_manager { public: using inode_cb = std::function const&)>; using order_cb = std::function const&)>; - inode_manager(logger& lgr, progress& prog); + inode_manager(logger& lgr, progress& prog, inode_options const& opts); std::shared_ptr create_inode() { return impl_->create_inode(); } size_t count() const { return impl_->count(); } - void order_inodes(std::shared_ptr