daemon/containerfs_linux.go
package daemon // import "github.com/docker/docker/daemon"
import (
"context"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"github.com/containerd/log"
"github.com/hashicorp/go-multierror"
"github.com/moby/sys/mount"
"github.com/moby/sys/symlink"
"golang.org/x/sys/unix"
containertypes "github.com/docker/docker/api/types/container"
"github.com/docker/docker/container"
"github.com/docker/docker/internal/mounttree"
"github.com/docker/docker/internal/unshare"
"github.com/docker/docker/pkg/fileutils"
)
type future struct {
fn func() error
res chan<- error
}
// containerFSView allows functions to be run in the context of a container's
// filesystem. Inside these functions, the root directory is the container root
// for all native OS filesystem APIs, including, but not limited to, the [os]
// and [golang.org/x/sys/unix] packages. The view of the container's filesystem
// is live and read-write. Each view has its own private set of tmpfs mounts.
// Any files written under a tmpfs mount are not visible to processes inside the
// container nor any other view of the container's filesystem, and vice versa.
//
// Each view has its own current working directory which is initialized to the
// root of the container filesystem and can be changed with [os.Chdir]. Changes
// to the current directory persist across successive [*containerFSView.RunInFS]
// and [*containerFSView.GoInFS] calls.
//
// Multiple views of the same container filesystem can coexist at the same time.
// Only one function can be running in a particular filesystem view at any given
// time. Calls to [*containerFSView.RunInFS] or [*containerFSView.GoInFS] will
// block while another function is running. If more than one call is blocked
// concurrently, the order they are unblocked is undefined.
type containerFSView struct {
d *Daemon
ctr *container.Container
todo chan future
done chan error
}
// openContainerFS opens a new view of the container's filesystem.
func (daemon *Daemon) openContainerFS(ctr *container.Container) (_ *containerFSView, retErr error) {
ctx := context.TODO()
if err := daemon.Mount(ctr); err != nil {
return nil, err
}
defer func() {
if retErr != nil {
if err := daemon.Unmount(ctr); err != nil {
log.G(ctx).WithError(err).Debug("Failed to unmount container after failure")
}
}
}()
mounts, cleanup, err := daemon.setupMounts(ctx, ctr)
if err != nil {
return nil, err
}
defer func() {
ctx := context.WithoutCancel(ctx)
if err := cleanup(ctx); err != nil {
log.G(ctx).WithError(err).Debug("Failed to cleanup container mounts")
}
if retErr != nil {
if err := ctr.UnmountVolumes(ctx, daemon.LogVolumeEvent); err != nil {
log.G(ctx).WithError(err).Debug("Failed to unmount container volumes after failure")
}
}
}()
// Setup in initial mount namespace complete. We're ready to unshare the
// mount namespace and bind the volume mounts into that private view of
// the container FS.
todo := make(chan future)
done := make(chan error)
err = unshare.Go(unix.CLONE_NEWNS,
func() error {
if err := mount.MakeRSlave("/"); err != nil {
return err
}
for _, m := range mounts {
dest, err := ctr.GetResourcePath(m.Destination)
if err != nil {
return err
}
var stat os.FileInfo
stat, err = os.Stat(m.Source)
if err != nil {
return err
}
if err := fileutils.CreateIfNotExists(dest, stat.IsDir()); err != nil {
return err
}
bindMode := "rbind"
if m.NonRecursive {
bindMode = "bind"
}
writeMode := "ro"
if m.Writable {
writeMode = "rw"
if m.ReadOnlyNonRecursive {
return errors.New("options conflict: Writable && ReadOnlyNonRecursive")
}
if m.ReadOnlyForceRecursive {
return errors.New("options conflict: Writable && ReadOnlyForceRecursive")
}
}
if m.ReadOnlyNonRecursive && m.ReadOnlyForceRecursive {
return errors.New("options conflict: ReadOnlyNonRecursive && ReadOnlyForceRecursive")
}
// openContainerFS() is called for temporary mounts
// outside the container. Soon these will be unmounted
// with lazy unmount option and given we have mounted
// them rbind, all the submounts will propagate if these
// are shared. If daemon is running in host namespace
// and has / as shared then these unmounts will
// propagate and unmount original mount as well. So make
// all these mounts rprivate. Do not use propagation
// property of volume as that should apply only when
// mounting happens inside the container.
opts := strings.Join([]string{bindMode, writeMode, "rprivate"}, ",")
if err := mount.Mount(m.Source, dest, "", opts); err != nil {
return err
}
if !m.Writable && !m.ReadOnlyNonRecursive {
if err := makeMountRRO(dest); err != nil {
if m.ReadOnlyForceRecursive {
return err
} else {
log.G(context.TODO()).WithError(err).Debugf("Failed to make %q recursively read-only", dest)
}
}
}
}
return mounttree.SwitchRoot(ctr.BaseFS)
},
func() {
defer close(done)
for it := range todo {
err := it.fn()
if it.res != nil {
it.res <- err
}
}
// The thread will terminate when this goroutine returns, taking the
// mount namespace and all the volume bind-mounts with it.
},
)
if err != nil {
return nil, err
}
vw := &containerFSView{
d: daemon,
ctr: ctr,
todo: todo,
done: done,
}
runtime.SetFinalizer(vw, (*containerFSView).Close)
return vw, nil
}
// RunInFS synchronously runs fn in the context of the container filesystem and
// passes through its return value.
//
// The container filesystem is only visible to functions called in the same
// goroutine as fn. Goroutines started from fn will see the host's filesystem.
func (vw *containerFSView) RunInFS(ctx context.Context, fn func() error) error {
res := make(chan error)
select {
case vw.todo <- future{fn: fn, res: res}:
case <-ctx.Done():
return ctx.Err()
}
return <-res
}
// GoInFS starts fn in the container FS. It blocks until fn is started but does
// not wait until fn returns. An error is returned if ctx is canceled before fn
// has been started.
//
// The container filesystem is only visible to functions called in the same
// goroutine as fn. Goroutines started from fn will see the host's filesystem.
func (vw *containerFSView) GoInFS(ctx context.Context, fn func()) error {
select {
case vw.todo <- future{fn: func() error { fn(); return nil }}:
return nil
case <-ctx.Done():
return ctx.Err()
}
}
// Close waits until any in-flight operations complete and frees all
// resources associated with vw.
func (vw *containerFSView) Close() error {
runtime.SetFinalizer(vw, nil)
close(vw.todo)
err := multierror.Append(nil, <-vw.done)
err = multierror.Append(err, vw.ctr.UnmountVolumes(context.TODO(), vw.d.LogVolumeEvent))
err = multierror.Append(err, vw.d.Unmount(vw.ctr))
return err.ErrorOrNil()
}
// Stat returns the metadata for path, relative to the current working directory
// of vw inside the container filesystem view.
func (vw *containerFSView) Stat(ctx context.Context, path string) (*containertypes.PathStat, error) {
var stat *containertypes.PathStat
err := vw.RunInFS(ctx, func() error {
lstat, err := os.Lstat(path)
if err != nil {
return err
}
var target string
if lstat.Mode()&os.ModeSymlink != 0 {
// Fully evaluate symlinks along path to the ultimate
// target, or as much as possible with broken links.
target, err = symlink.FollowSymlinkInScope(path, "/")
if err != nil {
return err
}
}
stat = &containertypes.PathStat{
Name: filepath.Base(path),
Size: lstat.Size(),
Mode: lstat.Mode(),
Mtime: lstat.ModTime(),
LinkTarget: target,
}
return nil
})
return stat, err
}
// makeMountRRO makes the mount recursively read-only.
func makeMountRRO(dest string) error {
attr := &unix.MountAttr{
Attr_set: unix.MOUNT_ATTR_RDONLY,
}
var err error
for {
err = unix.MountSetattr(-1, dest, unix.AT_RECURSIVE, attr)
if !errors.Is(err, unix.EINTR) {
break
}
}
if err != nil {
err = fmt.Errorf("failed to apply MOUNT_ATTR_RDONLY with AT_RECURSIVE to %q: %w", dest, err)
}
return err
}