Skip to content

Commit

Permalink
libpod: enable reflink copying for checkpoint tar files.
Browse files Browse the repository at this point in the history
This eliminates the overhead of handling rootfs diffs. Without these
changes, on my personal machine (T480 [email protected], Fedora 41 with BTRFS)
the below test takes 29s (checkpoint) and 18s (restore) to complete;
with these changes, it is 2.9s (checkpoint) and 2.0s (restore).

There are essentially two changes:

* set TarOptions.AlignBlockFile for creating snapshot and rootfs-diff
  tar files.  This is necessary for BTRFS to share data blocks when
  using copy_file_range ("reflink").

* (un)pack the rootfs diff through the /diff layer, rather than the
  /merged mountpoint, by simply replacing "/merged" with "/diff" in the
  path string. Without this change, the source is on an overlayfs
  mount and the destination is on BTRFS, so copy_file_range would fail
  with EXDEV.

Replacing the path suffix was done as a hack. There are two ways to do
this more cleanly:

1) extend the podman internal APIs so it offers a principled manner to
access the layer paths directly.

2) implement cross-FS copy_file_range() in Linux's overlayfs:
overlayfs could dereference the files to their layers, check if the
resulting source and destination are on the same filesystem, and then
delegate to that filesystem's copy_file_range implementation.

This needs extending the VFS definition, so a filesystem can signal
that it knows how to do cross FS-type copy_file_range. Currently, this
is forbidden through a check in generic_copy_file_checks() in
fs/read_write.c.

Approach 2) would speed up all of podman's tar handling in one fell
swoop, while 1) would also work with older kernel versions, but would
require updating all relevant callsites in podman.

Timing for checkpointing 5 GB of rootfs-diffs:

Command:

    (cd ../containers/podman/ && go build -tags "selinux seccomp" ./cmd/podman) &&    sudo podman container rm -f -t0 restored && echo 1 &&     sudo ~/vc/containers/podman/podman run -dt -p 8080:80/tcp docker.io/library/httpd &&     echo 2 &&     sudo ~/vc/containers/podman/podman  --log-level=debug container cp files.sh  $(sudo ~/vc/containers/podman/podman  ps  -l --format "{{.ID}}"):files.sh &&    echo 3 &&     sudo ~/vc/containers/podman/podman container exec -it  -l /bin/sh -x files.sh &&    echo 4 &&     sudo /usr/bin/time ~/vc/containers/podman/podman --log-level=debug container checkpoint --compress=none -e foo.tar  -l &&     echo 5 &&     sudo /usr/bin/time ~/vc/containers/podman/podman --log-level=debug container restore -i foo.tar --name restored

files.sh:

    $ cat files.sh
    #!/bin/sh
    dd if=/dev/urandom of=test bs=1M count=100

    echo hoi >> test
    for i in $(seq 1 50); do
        cp --reflink=never test test$i &
    done

    wait
    rm /usr/bin/znew

..
[INFO]:Jan 17 19:44:36.137 - processed getdiff (0 byte) in 423.148002ms: 0.000000 gb/s
[INFO]:Jan 17 19:44:36.137 - processed usr/local/apache2/test32 (104857604 byte) in 97.048µs: 1006.267555 gb/s
[INFO]:Jan 17 19:44:36.224 - processed usr/local/apache2/test34 (104857604 byte) in 86.024792ms: 1.135211 gb/s
[INFO]:Jan 17 19:44:36.224 - processed usr/local/apache2/test49
(104857604 byte) in 83.593µs: 1168.234825 gb/s
...
[INFO]:Jan 17 19:44:37.666 - processed checkpoint/tty-info.img (186 byte) in 563.93µs: 0.000307 gb/s
[INFO]:Jan 17 19:44:37.667 - processed checkpoint/utsns-12.img (34 byte) in 517.281µs: 0.000061 gb/s
[INFO]:Jan 17 19:44:37.670 - processed rootfs-diff.tar (5347956224 byte) in 2.52236ms: 1974.608049 gb/s
..
0.44user 1.66system 0:02.93elapsed 71%CPU (0avgtext+0avgdata 51736maxresident)k
2400inputs+34248outputs (1major+14586minor)pagefaults 0swaps

Restore:

...
[INFO]:Jan 17 19:44:38.817 - processed checkpoint/utsns-12.img (34 byte) in 278.176µs: 0.000114 gb/s
[INFO]:Jan 17 19:44:38.821 - processed rootfs-diff.tar (5347956224
byte) in 2.431477ms: 2048.414342 gb/s
...

[DEBUG]:Jan 17 19:44:38.833 - Mounted container
"78f739866462dd6a69e596fcd43bcd45409db7176410930a208de4ae862a439f" at
"/var/lib/containers/storage/overlay/27da5a8326fef0ca4887eb55297ff7fd6998f1e4b01d9cb2d630e0a4b9938359/merged"
...
[INFO]:Jan 17 19:44:39.124 - processed usr/local/apache2/test32 (104857604 byte) in 643.656µs: 151.721189 gb/s
...
[INFO]:Jan 17 19:44:39.175 - processed rootfsdiff (5347956224 byte) in
51.784872ms: 96.180065 gb/s
...
0.33user 0.26system 0:02.01elapsed 29%CPU (0avgtext+0avgdata 48336maxresident)k
728inputs+3296outputs (0major+7097minor)pagefaults 0swaps
  • Loading branch information
hanwen-flow authored and hanwen committed Jan 18, 2025
1 parent bd22a0d commit dd54c88
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 37 deletions.
73 changes: 48 additions & 25 deletions libpod/container_internal_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ import (
"github.com/containers/podman/v5/pkg/util"
"github.com/containers/podman/v5/version"
"github.com/containers/storage/pkg/archive"
tar "github.com/containers/storage/pkg/archive/hacktar"
"github.com/containers/storage/pkg/fileutils"
"github.com/containers/storage/pkg/idtools"
"github.com/containers/storage/pkg/lockfile"
Expand Down Expand Up @@ -1106,13 +1107,20 @@ func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error {
// Get root file-system changes included in the checkpoint archive
var addToTarFiles []string
if !options.IgnoreRootfs {
t := archive.Timer("getdiff", 0)
// To correctly track deleted files, let's go through the output of 'podman diff'
rootFsChanges, err := c.runtime.GetDiff("", c.ID(), define.DiffContainer)
if err != nil {
return fmt.Errorf("exporting root file-system diff for %q: %w", c.ID(), err)
}
t()

addToTarFiles, err := crutils.CRCreateRootFsDiffTar(&rootFsChanges, c.state.Mountpoint, c.bundlePath())
// TODO - this hack is not production quality
mp := c.state.Mountpoint
if filepath.Base(mp) == "merged" {
mp = filepath.Join(filepath.Dir(mp), "diff")
}
addToTarFiles, err := crutils.CRCreateRootFsDiffTar(&rootFsChanges, mp, c.bundlePath())
if err != nil {
return err
}
Expand Down Expand Up @@ -1151,6 +1159,7 @@ func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error {
return fmt.Errorf("volume %s is not mounted, cannot export: %w", volume.Name(), define.ErrInternal)
}

// TODO - should also use TarWithOptionsTo
input, err := archive.TarWithOptions(mp, &archive.TarOptions{
Compression: archive.Uncompressed,
IncludeSourceDir: true,
Expand All @@ -1169,28 +1178,35 @@ func (c *Container) exportCheckpoint(options ContainerCheckpointOptions) error {
}
}

input, err := archive.TarWithOptions(c.bundlePath(), &archive.TarOptions{
Compression: options.Compression,
IncludeSourceDir: true,
IncludeFiles: includeFiles,
})

if err != nil {
return fmt.Errorf("reading checkpoint directory %q: %w", c.ID(), err)
}

outFile, err := os.Create(options.TargetFile)
if err != nil {
return fmt.Errorf("creating checkpoint export file %q: %w", options.TargetFile, err)
}
defer outFile.Close()
defer outFile.Close() // error handling?

if err := os.Chmod(options.TargetFile, 0600); err != nil {
return err
var dest io.WriteCloser
dest = outFile

opts := &archive.TarOptions{
IncludeSourceDir: true,
IncludeFiles: includeFiles,
}
if options.Compression != archive.Uncompressed {
dest, err = archive.CompressStream(dest, options.Compression)
if err != nil {
return err
}
defer dest.Close()
} else {
opts.AlignBlockFile = outFile
}
tw := tar.NewWriter(dest)
defer tw.Close()
if err := archive.TarWithOptionsTo(c.bundlePath(), tw, opts); err != nil {
return fmt.Errorf("reading checkpoint directory %q: %w", c.ID(), err)
}

_, err = io.Copy(outFile, input)
if err != nil {
if err := os.Chmod(options.TargetFile, 0600); err != nil {
return err
}

Expand Down Expand Up @@ -1253,17 +1269,15 @@ func (c *Container) checkpoint(ctx context.Context, options ContainerCheckpointO
if err != nil {
return nil, 0, err
}
defer shmDirTarFile.Close()
defer shmDirTarFile.Close() // no error handling?

input, err := archive.TarWithOptions(c.config.ShmDir, &archive.TarOptions{
tw := tar.NewWriter(shmDirTarFile)
defer tw.Close()
if err := archive.TarWithOptionsTo(c.config.ShmDir, tw, &archive.TarOptions{
Compression: archive.Uncompressed,
IncludeSourceDir: true,
})
if err != nil {
return nil, 0, err
}

if _, err = io.Copy(shmDirTarFile, input); err != nil {
AlignBlockFile: shmDirTarFile,
}); err != nil {
return nil, 0, err
}
}
Expand Down Expand Up @@ -1464,9 +1478,13 @@ func (c *Container) restore(ctx context.Context, options ContainerCheckpointOpti
}

if options.TargetFile != "" {
fi, _ := os.Stat(options.TargetFile)
t := archive.Timer(options.TargetFile, int(fi.Size()))

if err := c.importCheckpointTar(options.TargetFile); err != nil {
return nil, 0, err
}
t()
} else if options.CheckpointImageID != "" {
if err := c.importCheckpointImage(ctx, options.CheckpointImageID); err != nil {
return nil, 0, err
Expand Down Expand Up @@ -1742,7 +1760,12 @@ func (c *Container) restore(ctx context.Context, options ContainerCheckpointOpti

// Before actually restarting the container, apply the root file-system changes
if !options.IgnoreRootfs {
if err := crutils.CRApplyRootFsDiffTar(c.bundlePath(), c.state.Mountpoint); err != nil {
mp := c.state.Mountpoint
// TODO - this is not production quality
if filepath.Base(mp) == "merged" {
mp = filepath.Join(filepath.Dir(mp), "diff")
}
if err := crutils.CRApplyRootFsDiffTar(c.bundlePath(), mp); err != nil {
return nil, 0, err
}

Expand Down
24 changes: 12 additions & 12 deletions pkg/checkpoint/crutils/checkpoint_restore_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ import (
"bytes"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"

metadata "github.com/checkpoint-restore/checkpointctl/lib"
"github.com/checkpoint-restore/go-criu/v7/stats"
"github.com/containers/storage/pkg/archive"
tar "github.com/containers/storage/pkg/archive/hacktar"
"github.com/opencontainers/selinux/go-selinux/label"
)

Expand Down Expand Up @@ -108,6 +108,8 @@ func CRApplyRootFsDiffTar(baseDirectory, containerRootDirectory string) error {
return fmt.Errorf("failed to open root file-system diff file: %w", err)
}
defer rootfsDiffFile.Close()
fi, _ := rootfsDiffFile.Stat()
defer archive.Timer("rootfsdiff", int(fi.Size()))()

if err := archive.Untar(rootfsDiffFile, containerRootDirectory, nil); err != nil {
return fmt.Errorf("failed to apply root file-system diff file %s: %w", rootfsDiffPath, err)
Expand Down Expand Up @@ -152,21 +154,19 @@ func CRCreateRootFsDiffTar(changes *[]archive.Change, mountPoint, destination st
}

if len(rootfsIncludeFiles) > 0 {
rootfsTar, err := archive.TarWithOptions(mountPoint, &archive.TarOptions{
Compression: archive.Uncompressed,
IncludeSourceDir: true,
IncludeFiles: rootfsIncludeFiles,
})
if err != nil {
return includeFiles, fmt.Errorf("exporting root file-system diff to %q: %w", rootfsDiffPath, err)
}
rootfsDiffFile, err := os.Create(rootfsDiffPath)
if err != nil {
return includeFiles, fmt.Errorf("creating root file-system diff file %q: %w", rootfsDiffPath, err)
}
defer rootfsDiffFile.Close()
if _, err = io.Copy(rootfsDiffFile, rootfsTar); err != nil {
return includeFiles, err
defer rootfsDiffFile.Close() // error handling?
tw := tar.NewWriter(rootfsDiffFile)
defer tw.Close()
if err := archive.TarWithOptionsTo(mountPoint, tw, &archive.TarOptions{
IncludeSourceDir: true,
IncludeFiles: rootfsIncludeFiles,
AlignBlockFile: rootfsDiffFile,
}); err != nil {
return includeFiles, fmt.Errorf("exporting root file-system diff to %q: %w", rootfsDiffPath, err)
}

includeFiles = append(includeFiles, metadata.RootFsDiffTar)
Expand Down

0 comments on commit dd54c88

Please sign in to comment.