Skip to content

Commit

Permalink
feat: support extracting only files matching glob patterns (fixes gh #…
Browse files Browse the repository at this point in the history
  • Loading branch information
mhx committed Nov 18, 2024
1 parent f40fc5e commit 9e7760b
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 7 deletions.
11 changes: 11 additions & 0 deletions doc/dwarfsextract.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ to disk:
case the default is to extract the files to the current directory, or
to write the archive data to stdout.

- `--pattern=`*glob-pattern*:
If specified, only extract entries matching the pattern. Can be specified
multiple times, in which case all files matching at least one of the patterns
will be extracted. For simplicity, patterns can also be passed as positional
arguments, i.e. without `--pattern`.
If prefixed with `i:`, the glob pattern match will be case-insensitive,
e.g. `i:**/*.txt` would match all `.txt`, `.TXT` and `.tXt` files. If you
want the *actual* pattern to start with `i:`, you have to prefix the
pattern with `:`, i.e. `:i:...`. The glob patterns support the wildcards
`*` and `?`, character classes (`[avt]`), ranges (`[a-h]`), complementation
(`[!a-h]`) and globstar (`**`).

- `-O`, `--image-offset=`*value*|`auto`:
Specify the byte offset at which the filesystem is located in the image.
Use `auto` to detect the offset automatically. This is also the default.
Expand Down
14 changes: 11 additions & 3 deletions include/dwarfs/utility/filesystem_extractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

namespace dwarfs {

class glob_matcher;
class library_dependencies;
class logger;
class os_access;
Expand Down Expand Up @@ -72,7 +73,13 @@ class filesystem_extractor {
// Extract everything from `fs`, without any pattern filtering.
//
// Forwards to the implementation with a null matcher; the implementation
// only applies glob filtering when a matcher is supplied.
//
// fs:   filesystem to extract from
// opts: extraction options (default-constructed when omitted)
//
// Returns whatever the implementation's extract() returns.
bool extract(reader::filesystem_v2 const& fs,
             filesystem_extractor_options const& opts =
                 filesystem_extractor_options()) {
  // The former two-argument call `impl_->extract(fs, opts)` is gone:
  // impl::extract now takes the matcher as its second parameter.
  return impl_->extract(fs, nullptr, opts);
}

// Extract the parts of `fs` selected by `matcher`.
//
// fs:      filesystem to extract from
// matcher: glob matcher forwarded unchanged to the implementation; the
//          matcher-less overload passes a null pointer here
// opts:    extraction options (default-constructed when omitted)
//
// Returns the result reported by the implementation.
bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
             filesystem_extractor_options const& opts =
                 filesystem_extractor_options()) {
  bool const extracted = impl_->extract(fs, matcher, opts);
  return extracted;
}

class impl {
Expand All @@ -84,8 +91,9 @@ class filesystem_extractor {
virtual void open_stream(std::ostream& os, std::string const& format) = 0;
virtual void open_disk(std::filesystem::path const& output) = 0;
virtual void close() = 0;
virtual bool extract(reader::filesystem_v2 const& fs,
filesystem_extractor_options const& opts) = 0;
virtual bool
extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) = 0;
};

private:
Expand Down
41 changes: 39 additions & 2 deletions src/utility/filesystem_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_set>

// This is required to avoid Windows.h being pulled in by libarchive
// and polluting our environment with all sorts of shit.
Expand All @@ -41,6 +42,7 @@

#include <dwarfs/file_stat.h>
#include <dwarfs/fstypes.h>
#include <dwarfs/glob_matcher.h>
#include <dwarfs/library_dependencies.h>
#include <dwarfs/logger.h>
#include <dwarfs/os_access.h>
Expand Down Expand Up @@ -188,7 +190,7 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {
}
}

bool extract(reader::filesystem_v2 const& fs,
bool extract(reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) override;

private:
Expand Down Expand Up @@ -248,7 +250,8 @@ class filesystem_extractor_ final : public filesystem_extractor::impl {

template <typename LoggerPolicy>
bool filesystem_extractor_<LoggerPolicy>::extract(
reader::filesystem_v2 const& fs, filesystem_extractor_options const& opts) {
reader::filesystem_v2 const& fs, glob_matcher const* matcher,
filesystem_extractor_options const& opts) {
DWARFS_CHECK(a_, "filesystem not opened");

auto lr = ::archive_entry_linkresolver_new();
Expand Down Expand Up @@ -350,6 +353,23 @@ bool filesystem_extractor_<LoggerPolicy>::extract(
}
};

// Paths of all directories that have at least one matching non-directory
// entry somewhere below them. Stays empty when no matcher is given.
std::unordered_set<std::filesystem::path> matched_dirs;

if (matcher) {
// Pre-pass over the filesystem: for every matching non-directory entry,
// record each of its ancestor directories.
fs.walk([&](auto entry) {
if (!entry.inode().is_directory()) {
if (matcher->match(entry.unix_path())) {
// Climb towards the root until there is no parent left.
while (auto parent = entry.parent()) {
// insert() reports `false` if the path was already present; stop
// climbing then (presumably its ancestors were recorded when it was
// first inserted).
if (!matched_dirs.insert(parent->fs_path()).second) {
break;
}
entry = *parent;
}
}
}
});
}

fs.walk_data_order([&](auto entry) {
// TODO: we can surely early abort walk() somehow
if (entry.is_root() || hard_error) {
Expand All @@ -358,6 +378,23 @@ bool filesystem_extractor_<LoggerPolicy>::extract(

auto inode = entry.inode();

// With a matcher, decide per entry whether it is extracted at all.
if (matcher) {
LOG_TRACE << "checking " << entry.unix_path();
if (inode.is_directory()) {
// Directories are never matched against the patterns themselves; they
// are kept only if the pre-pass recorded them in `matched_dirs`.
if (!matched_dirs.contains(entry.fs_path())) {
LOG_TRACE << "skipping directory " << entry.fs_path();
// no need to extract this directory
return;
}
} else {
// NOTE(review): the pre-pass that fills `matched_dirs` already evaluated
// this same match for every non-directory entry, so each such entry is
// matched twice per extraction.
if (!matcher->match(entry.unix_path())) {
LOG_TRACE << "skipping " << entry.fs_path();
// no match, skip this entry
return;
}
}
}

auto ae = ::archive_entry_new();
auto stbuf = fs.getattr(inode);

Expand Down
31 changes: 31 additions & 0 deletions test/tool_main_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2000,6 +2000,37 @@ TEST(dwarfsextract_test, mtree) {
EXPECT_THAT(out, ::testing::HasSubstr("type=file"));
}

// Checks that dwarfsextract, when given positional glob patterns, emits only
// the matching entries together with the directories leading to them.
TEST(dwarfsextract_test, patterns) {
  // Build an image from the tree added by add_test_file_tree().
  auto mkdt = mkdwarfs_tester::create_empty();
  mkdt.add_test_file_tree();
  // Compare the exit status itself (not `status != 0`) so that a failure
  // reports the actual exit code, like the dwarfsextract assertion below.
  ASSERT_EQ(0, mkdt.run({"-i", "/", "-o", "-", "--with-devices"}))
      << mkdt.err();

  // Extract as mtree, passing both patterns as positional arguments.
  auto t = dwarfsextract_tester::create_with_image(mkdt.out());
  ASSERT_EQ(0, t.run({"-i", "image.dwarfs", "-f", "mtree", "**/*.enc",
                      "{dev,etc,lib,var}/[m-ot-z]*"}))
      << t.err();

  auto out = t.out();
  EXPECT_TRUE(out.starts_with("#mtree")) << out;

  // Exact list of entry names expected in the mtree output, in order:
  // the matching entries plus their parent directories.
  std::vector<std::string> const expected{
      "./dev",
      "./dev/tty37",
      "./etc",
      "./etc/netconfig",
      "./usr",
      "./usr/lib64",
      "./usr/lib64/tcl8.6",
      "./usr/lib64/tcl8.6/encoding",
      "./usr/lib64/tcl8.6/encoding/cp950.enc",
      "./usr/lib64/tcl8.6/encoding/iso8859-8.enc",
  };

  // Only the entry names are compared; the rest of each parsed mtree record
  // is ignored.
  auto mtree = test::parse_mtree(out);
  std::vector<std::string> actual;
  for (auto const& entry : mtree) {
    actual.push_back(entry.first);
  }
  EXPECT_EQ(expected, actual);
}

TEST(dwarfsextract_test, stdout_progress_error) {
auto t = dwarfsextract_tester::create_with_image();
EXPECT_NE(0,
Expand Down
22 changes: 20 additions & 2 deletions tools/src/dwarfsextract_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <boost/program_options.hpp>

#include <dwarfs/config.h>
#include <dwarfs/glob_matcher.h>
#include <dwarfs/library_dependencies.h>
#include <dwarfs/logger.h>
#include <dwarfs/mmap.h>
Expand Down Expand Up @@ -77,6 +78,9 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
("output,o",
po_sys_value<sys_string>(&output),
"output file or directory")
("pattern",
po::value<std::vector<std::string>>(),
"only extract files matching these patterns")
("image-offset,O",
po::value<std::string>(&image_offset)->default_value("auto"),
"filesystem image offset in bytes")
Expand Down Expand Up @@ -111,10 +115,17 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {

tool::add_common_options(opts, logopts);

po::positional_options_description pos;
pos.add("pattern", -1);

po::variables_map vm;

try {
po::store(po::parse_command_line(argc, argv, opts), vm);
po::store(po::basic_command_line_parser<sys_char>(argc, argv)
.options(opts)
.positional(pos)
.run(),
vm);
po::notify(vm);
} catch (po::error const& e) {
iol.err << "error: " << e.what() << "\n";
Expand All @@ -141,6 +152,13 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
return 0;
}

// Stays null unless at least one pattern was supplied; the raw pointer is
// later handed to the extractor, where a null matcher means "no filtering".
std::unique_ptr<glob_matcher> matcher;

// "pattern" collects both `--pattern` values and positional arguments.
if (vm.count("pattern")) {
matcher = std::make_unique<glob_matcher>(
vm["pattern"].as<std::vector<std::string>>());
}

int rv = 0;

try {
Expand Down Expand Up @@ -214,7 +232,7 @@ int dwarfsextract_main(int argc, sys_char** argv, iolayer const& iol) {
};
}

rv = fsx.extract(fs, fsx_opts) ? 0 : 2;
rv = fsx.extract(fs, matcher.get(), fsx_opts) ? 0 : 2;

fsx.close();

Expand Down

0 comments on commit 9e7760b

Please sign in to comment.