dotcloud/docker

View on GitHub
internal/unshare/unshare_linux.go

Summary

Maintainability
B
4 hrs
Test Coverage
//go:build go1.10

package unshare // import "github.com/docker/docker/internal/unshare"

import (
    "fmt"
    "os"
    "runtime"

    "golang.org/x/sys/unix"
)

// init wires the goroutine that runs package initialization, and subsequently
// main(), to the process's startup thread so that no other goroutine can ever
// be scheduled onto that thread and alter its per-thread kernel state.
func init() {
    // The startup thread of a process is special in a few different ways.
    // Most pertinent to the discussion at hand, any per-thread kernel state
    // reflected in the /proc/[pid]/ directory for a process is taken from
    // the state of the startup thread. Same goes for /proc/self/; it shows
    // the state of the current process's startup thread, no matter which
    // thread the files are being opened from. For most programs this is a
    // distinction without a difference as the kernel state, such as the
    // mount namespace and current working directory, is shared among (and
    // kept synchronized across) all threads of a process. But things start
    // to break down once threads start unsharing and modifying parts of
    // their kernel state.
    //
    // The Go runtime schedules goroutines to execute on the startup thread,
    // same as any other. How this could be problematic is best illustrated
    // with a concrete example. Consider what happens if a call to
    // Go(unix.CLONE_NEWNS, ...) spawned a goroutine which gets scheduled
    // onto the startup thread. The thread's mount namespace will be
    // unshared and modified. The contents of the /proc/[pid]/mountinfo file
    // will then describe the mount tree of the unshared namespace, not the
    // namespace of any other thread. It will remain this way until the
    // process exits. (The startup thread is special in another way: exiting
    // it puts the process into a "non-waitable zombie" state. To avoid this
    // fate, the Go runtime parks the thread instead of exiting if a
    // goroutine returns while locked to the startup thread. More
    // information can be found in the Go runtime sources:
    // `go doc -u -src runtime.mexit`.) The github.com/moby/sys/mountinfo
    // package reads from /proc/self/mountinfo, so will read the mount tree
    // for the wrong namespace if the startup thread has had its mount
    // namespace unshared! The /proc/thread-self/ directory, introduced in
    // Linux 3.17, is one potential solution to this problem, but every
    // package which opens files in /proc/self/ would need to be updated,
    // and fallbacks to /proc/self/task/[tid]/ would be required to support
    // older kernels. Overlooking any reference to /proc/self/ would
    // manifest as stochastically-reproducible bugs, so this is far from an
    // ideal solution.
    //
    // Reading from /proc/self/ would not be a problem if we could prevent
    // the per-thread state of the startup thread from being modified
    // nondeterministically in the first place. We can accomplish this
    // simply by locking the main() function to the startup thread! Doing so
    // excludes any other goroutine from being scheduled on the thread.
    //
    // This relies on the behavior documented for runtime.LockOSThread:
    // init functions run on the startup thread, and calling LockOSThread
    // from an init function causes main to be invoked on that thread.
    runtime.LockOSThread()
}

// reversibleSetnsFlags lists the unshare(2) flags that setns(2) is able to
// undo completely. Every flag is keyed to the file name of its magic symlink
// under /proc/self/task/[tid]/ns/; that symlink is opened to capture the
// thread's namespace before unsharing and is handed to setns(2) afterwards to
// switch back.
var reversibleSetnsFlags = map[int]string{
    unix.CLONE_NEWCGROUP: "cgroup",
    unix.CLONE_NEWNET:    "net",
    unix.CLONE_NEWPID:    "pid",
    unix.CLONE_NEWTIME:   "time",
    unix.CLONE_NEWUTS:    "uts",

    // Deliberately left out: the CLONE_NEW* flags below each pull in a
    // second flag when passed to unshare(2), and that second flag cannot be
    // reversed.
    //  - unix.CLONE_NEWIPC:  implies CLONE_SYSVMEM
    //  - unix.CLONE_NEWNS:   implies CLONE_FS
    //  - unix.CLONE_NEWUSER: implies CLONE_FS since Linux 3.9
}

// Go runs setupfn followed by fn on a new goroutine. That goroutine is locked
// to an OS thread, and the thread's execution state is disassociated from the
// rest of the current process with [unshare(2)] before either function runs.
//
// Go returns once the new goroutine has started and setupfn has returned; it
// does not wait for fn. A failure while preparing the thread, or a non-nil
// error from setupfn, is returned to the caller and fn is not called. Passing
// nil for setupfn or fn is the same as passing a function that does nothing.
//
// The disassociated execution state, and any changes made to it, can only be
// observed from the goroutine the functions are called in. All other
// goroutines, including any started from within the functions, see the same
// execution state as the rest of the process.
//
// The flags which may be passed are documented in the [unshare(2)] Linux
// man-page. Package [unix] defines the corresponding CLONE_* constants.
//
// # Warning
//
// The thread the new goroutine executed on may be terminated after fn
// returns. Subprocesses started with the [syscall.SysProcAttr] Pdeathsig field
// set could consequently be signaled before the process terminates. This can
// affect subprocesses started before this function is called as well as any
// started inside setupfn or fn. See https://go.dev/issue/27505 for details.
//
// [unshare(2)]: https://man7.org/linux/man-pages/man2/unshare.2.html
func Go(flags int, setupfn func() error, fn func()) error {
    // Carries the single start-up error, if any. It is closed on success so
    // that the receive at the bottom of this function yields nil.
    ready := make(chan error)

    // Clear every bit that setns(2) knows how to undo. Any bit still set
    // afterwards is a change which cannot be rolled back.
    leftover := flags
    for bit := range reversibleSetnsFlags {
        leftover &^= bit
    }
    canRestore := leftover == 0

    go func() {
        // Pin the goroutine before touching per-thread kernel state.
        runtime.LockOSThread()

        // Some changes to the execution state are permanent. When one
        // has been made, the only way out is to return from this
        // function with the goroutine still wired to the thread: the Go
        // runtime then terminates the tampered thread and starts a
        // fresh one when it needs to.

        if canRestore {
            // Deferred first, therefore run last: by then every
            // namespace-restoring defer registered below has finished
            // and canRestore tells whether all of them succeeded.
            defer func() {
                if !canRestore {
                    return
                }
                // Everything was put back without error, so the
                // thread is interchangeable with any other again.
                runtime.UnlockOSThread()
            }()

            tid := unix.Gettid()

            // openNS opens this thread's handle for the named namespace.
            // /proc/thread-self only exists since Linux 3.17; the longer
            // per-task path is used to stay compatible with older kernels.
            openNS := func(name string) (int, error) {
                nsPath := fmt.Sprintf("/proc/self/task/%d/ns/%s", tid, name)
                fd, err := unix.Open(nsPath, unix.O_RDONLY|unix.O_CLOEXEC, 0)
                if err != nil {
                    return -1, &os.PathError{Op: "open", Path: nsPath, Err: err}
                }
                return fd, nil
            }

            for bit, name := range reversibleSetnsFlags {
                if flags&bit != bit {
                    // This namespace was not requested.
                    continue
                }
                fd, err := openNS(name)
                if err != nil {
                    ready <- err
                    return
                }
                defer func() {
                    // Stop attempting restores after the first failure;
                    // the thread is going to be discarded anyway.
                    if canRestore && unix.Setns(fd, 0) != nil {
                        canRestore = false
                    }
                    _ = unix.Close(fd)
                }()
            }
        }

        // Linux implements threads as processes sharing one virtual
        // memory space, so in a multithreaded process unshare(2) splits
        // parts of the calling thread's context off from the thread it
        // was clone(2)'d from.
        if err := unix.Unshare(flags); err != nil {
            ready <- os.NewSyscallError("unshare", err)
            return
        }

        var setupErr error
        if setupfn != nil {
            setupErr = setupfn()
        }
        if setupErr != nil {
            ready <- setupErr
            return
        }

        // Release the caller before running fn.
        close(ready)

        if fn == nil {
            return
        }
        fn()
    }()

    return <-ready
}