diff --git a/doc/dwarfsextract.md b/doc/dwarfsextract.md index 577e85255..5390ca182 100644 --- a/doc/dwarfsextract.md +++ b/doc/dwarfsextract.md @@ -44,6 +44,17 @@ to disk: case the default is to extract the files to the current directory, or to write the archive data to stdout. +- `--pattern=`*glob-pattern*: + If specified, only extract entries matching the pattern. Can be specified + multiple times, in which case all files matching one or more patterns will + be extracted. Patterns can also be given as positional arguments, i.e. without `--pattern`. + If prefixed with `i:`, the glob pattern match will be case-insensitive, + e.g. `i:**/*.txt` would match all `.txt`, `.TXT` and `.tXt` files. If you + want the *actual* pattern to start with `i:`, you have to prefix the + pattern with `:`, i.e. `:i:...`. The glob patterns support the wildcards + `*` and `?`, character classes (`[avt]`), ranges (`[a-h]`), complementation + (`[!a-h]`), brace alternatives (`{dev,etc}`) and globstar (`**`). + - `-O`, `--image-offset=`*value*|`auto`: Specify the byte offset at which the filesystem is located in the image. Use `auto` to detect the offset automatically. This is also the default. 
diff --git a/include/dwarfs/utility/filesystem_extractor.h b/include/dwarfs/utility/filesystem_extractor.h index 88da00ed3..c9fd3af0f 100644 --- a/include/dwarfs/utility/filesystem_extractor.h +++ b/include/dwarfs/utility/filesystem_extractor.h @@ -30,6 +30,7 @@ namespace dwarfs { +class glob_matcher; class library_dependencies; class logger; class os_access; @@ -72,7 +73,13 @@ class filesystem_extractor { bool extract(reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts = filesystem_extractor_options()) { - return impl_->extract(fs, opts); + return impl_->extract(fs, nullptr, opts); + } + + bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher, + filesystem_extractor_options const& opts = + filesystem_extractor_options()) { + return impl_->extract(fs, matcher, opts); } class impl { @@ -84,8 +91,9 @@ class filesystem_extractor { virtual void open_stream(std::ostream& os, std::string const& format) = 0; virtual void open_disk(std::filesystem::path const& output) = 0; virtual void close() = 0; - virtual bool extract(reader::filesystem_v2 const& fs, - filesystem_extractor_options const& opts) = 0; + virtual bool + extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher, + filesystem_extractor_options const& opts) = 0; }; private: diff --git a/src/utility/filesystem_extractor.cpp b/src/utility/filesystem_extractor.cpp index b81cdef6b..7d03af442 100644 --- a/src/utility/filesystem_extractor.cpp +++ b/src/utility/filesystem_extractor.cpp @@ -24,6 +24,7 @@ #include #include #include +#include // This is required to avoid Windows.h being pulled in by libarchive // and polluting our environment with all sorts of shit. 
@@ -41,6 +42,7 @@ #include #include +#include #include #include #include @@ -188,7 +190,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl { } } - bool extract(reader::filesystem_v2 const& fs, + bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher, filesystem_extractor_options const& opts) override; private: @@ -248,7 +250,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl { template bool filesystem_extractor_::extract( - reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts) { + reader::filesystem_v2 const& fs, glob_matcher const* matcher, + filesystem_extractor_options const& opts) { DWARFS_CHECK(a_, "filesystem not opened"); auto lr = ::archive_entry_linkresolver_new(); @@ -350,6 +353,23 @@ bool filesystem_extractor_::extract( } }; + std::unordered_set matched_dirs; + + if (matcher) { + fs.walk([&](auto entry) { + if (!entry.inode().is_directory()) { + if (matcher->match(entry.unix_path())) { + while (auto parent = entry.parent()) { + if (!matched_dirs.insert(parent->fs_path()).second) { + break; + } + entry = *parent; + } + } + } + }); + } + fs.walk_data_order([&](auto entry) { // TODO: we can surely early abort walk() somehow if (entry.is_root() || hard_error) { @@ -358,6 +378,23 @@ bool filesystem_extractor_::extract( auto inode = entry.inode(); + if (matcher) { + LOG_TRACE << "checking " << entry.unix_path(); + if (inode.is_directory()) { + if (!matched_dirs.contains(entry.fs_path())) { + LOG_TRACE << "skipping directory " << entry.fs_path(); + // no need to extract this directory + return; + } + } else { + if (!matcher->match(entry.unix_path())) { + LOG_TRACE << "skipping " << entry.fs_path(); + // no match, skip this entry + return; + } + } + } + auto ae = ::archive_entry_new(); auto stbuf = fs.getattr(inode); diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp index 6a1b4fc24..0a012e894 100644 --- a/test/tool_main_test.cpp +++ b/test/tool_main_test.cpp @@ 
-2000,6 +2000,37 @@ TEST(dwarfsextract_test, mtree) { EXPECT_THAT(out, ::testing::HasSubstr("type=file")); } +TEST(dwarfsextract_test, patterns) { + auto mkdt = mkdwarfs_tester::create_empty(); + mkdt.add_test_file_tree(); + ASSERT_EQ(0, mkdt.run({"-i", "/", "-o", "-", "--with-devices"})) + << mkdt.err(); + auto t = dwarfsextract_tester::create_with_image(mkdt.out()); + ASSERT_EQ(0, t.run({"-i", "image.dwarfs", "-f", "mtree", "**/*.enc", + "{dev,etc,lib,var}/[m-ot-z]*"})) + << t.err(); + auto out = t.out(); + EXPECT_TRUE(out.starts_with("#mtree")) << out; + std::vector const expected{ + "./dev", + "./dev/tty37", + "./etc", + "./etc/netconfig", + "./usr", + "./usr/lib64", + "./usr/lib64/tcl8.6", + "./usr/lib64/tcl8.6/encoding", + "./usr/lib64/tcl8.6/encoding/cp950.enc", + "./usr/lib64/tcl8.6/encoding/iso8859-8.enc", + }; + auto mtree = test::parse_mtree(out); + std::vector actual; + for (auto const& entry : mtree) { + actual.push_back(entry.first); + } + EXPECT_EQ(expected, actual); +} + TEST(dwarfsextract_test, stdout_progress_error) { auto t = dwarfsextract_tester::create_with_image(); EXPECT_NE(0, diff --git a/tools/src/dwarfsextract_main.cpp b/tools/src/dwarfsextract_main.cpp index e830efe1a..6fc4dd1db 100644 --- a/tools/src/dwarfsextract_main.cpp +++ b/tools/src/dwarfsextract_main.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -77,6 +78,9 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { ("output,o", po_sys_value(&output), "output file or directory") + ("pattern", + po::value>(), + "only extract files matching these patterns") ("image-offset,O", po::value(&image_offset)->default_value("auto"), "filesystem image offset in bytes") @@ -111,10 +115,17 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { tool::add_common_options(opts, logopts); + po::positional_options_description pos; + pos.add("pattern", -1); + po::variables_map vm; try { - po::store(po::parse_command_line(argc, 
argv, opts), vm); + po::store(po::basic_command_line_parser(argc, argv) + .options(opts) + .positional(pos) + .run(), + vm); po::notify(vm); } catch (po::error const& e) { iol.err << "error: " << e.what() << "\n"; @@ -141,6 +152,13 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { return 0; } + std::unique_ptr matcher; + + if (vm.count("pattern")) { + matcher = std::make_unique( + vm["pattern"].as>()); + } + int rv = 0; try { @@ -214,7 +232,7 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) { }; } - rv = fsx.extract(fs, fsx_opts) ? 0 : 2; + rv = fsx.extract(fs, matcher.get(), fsx_opts) ? 0 : 2; fsx.close();