dotcloud/docker

View on GitHub
daemon/oci_windows.go

Summary

Maintainability
D
1 day
Test Coverage
package daemon // import "github.com/docker/docker/daemon"

import (
    "context"
    "encoding/json"
    "fmt"
    "os"
    "path/filepath"
    "strings"

    "github.com/Microsoft/hcsshim"
    coci "github.com/containerd/containerd/oci"
    "github.com/containerd/log"
    "github.com/docker/docker/api/types/backend"
    containertypes "github.com/docker/docker/api/types/container"
    "github.com/docker/docker/container"
    "github.com/docker/docker/daemon/config"
    "github.com/docker/docker/errdefs"
    "github.com/docker/docker/image"
    "github.com/docker/docker/oci"
    "github.com/docker/docker/pkg/sysinfo"
    "github.com/docker/docker/pkg/system"
    specs "github.com/opencontainers/runtime-spec/specs-go"
    "github.com/pkg/errors"
    "golang.org/x/sys/windows/registry"
)

const (
    // credentialSpecRegistryLocation is the registry key path that
    // readCredentialSpecRegistry opens under registry.LOCAL_MACHINE to look
    // up credential specs referenced with the `registry://` scheme.
    credentialSpecRegistryLocation = `SOFTWARE\Microsoft\Windows NT\CurrentVersion\Virtualization\Containers\CredentialSpecs`
    // credentialSpecFileLocation is the directory name that
    // readCredentialSpecFile joins onto the root it is given; credential
    // specs referenced with the `file://` scheme are read from there.
    credentialSpecFileLocation     = "CredentialSpecs"
)

// setupContainerDirs prepares the secret and config directories of the
// container and returns the mounts needed to expose them inside it.
//
// On first start of a container that references secrets or configs, the
// corresponding symlinks are created as well. The returned slice holds the
// secret mounts followed by the config mounts; it is nil when there are none.
func (daemon *Daemon) setupContainerDirs(c *container.Container) ([]container.Mount, error) {
    // Note, unlike Unix, we do NOT call into SetupWorkingDirectory as
    // this is done in VMCompute. Further, we couldn't do it for Hyper-V
    // containers anyway.
    if err := daemon.setupSecretDir(c); err != nil {
        return nil, err
    }
    if err := daemon.setupConfigDir(c); err != nil {
        return nil, err
    }

    // Symlinks to each config and secret are only created the first time the
    // container is started; on later starts they already exist. Skipping this
    // for previously started containers also matters for safety: a Hyper-V
    // container that has already run must not be mounted on the host, to
    // protect the host from, for example, malicious mutation of NTFS data
    // structures.
    hasReferences := len(c.SecretReferences) > 0 || len(c.ConfigReferences) > 0
    if hasReferences && !c.HasBeenStartedBefore {
        // The container file system is mounted before this function is
        // called, except for Hyper-V containers, so mount it here (and
        // release it again on return) in that case.
        if daemon.isHyperV(c) {
            if err := daemon.Mount(c); err != nil {
                return nil, err
            }
            defer daemon.Unmount(c)
        }
        if err := c.CreateSecretSymlinks(); err != nil {
            return nil, err
        }
        if err := c.CreateConfigSymlinks(); err != nil {
            return nil, err
        }
    }

    secretMounts, err := c.SecretMounts()
    if err != nil {
        return nil, err
    }

    // Appending a nil or empty slice is a no-op, so no nil guards are needed.
    var mounts []container.Mount
    mounts = append(mounts, secretMounts...)
    mounts = append(mounts, c.ConfigMounts()...)
    return mounts, nil
}

// isHyperV reports whether the container runs with Hyper-V isolation.
// An isolation mode set explicitly on the container wins; otherwise the
// daemon's default isolation decides.
func (daemon *Daemon) isHyperV(c *container.Container) bool {
    if requested := c.HostConfig.Isolation; !requested.IsDefault() {
        // The container asked for a specific isolation mode.
        return requested.IsHyperV()
    }
    // Nothing requested: fall back to the daemon configuration.
    return daemon.defaultIsolation.IsHyperV()
}

// createSpec builds the OCI runtime spec used to start container c on Windows.
//
// Starting from oci.DefaultSpec it applies, in order: the HostConfig
// annotations, the supplied mounts (read-only unless marked writable), the
// process settings (working directory, environment, terminal, user), the
// image layer folders, the network endpoints, and finally the
// Windows-specific fields set by createSpecWindowsFields.
//
// An error is returned when the image cannot be resolved or fails the OS
// check, when annotations, linked containers or layer folders cannot be set
// up, or when createSpecWindowsFields fails. Failures while resolving an
// individual network or endpoint are not fatal: that network is skipped.
func (daemon *Daemon) createSpec(ctx context.Context, daemonCfg *configStore, c *container.Container, mounts []container.Mount) (*specs.Spec, error) {
    img, err := daemon.imageService.GetImage(ctx, string(c.ImageID), backend.GetImageOpts{})
    if err != nil {
        return nil, err
    }
    if err := image.CheckOS(img.OperatingSystem()); err != nil {
        return nil, err
    }

    s := oci.DefaultSpec()

    if err := coci.WithAnnotations(c.HostConfig.Annotations)(ctx, nil, nil, &s); err != nil {
        return nil, err
    }

    for _, mount := range mounts {
        m := specs.Mount{
            Source:      mount.Source,
            Destination: mount.Destination,
        }
        if !mount.Writable {
            m.Options = append(m.Options, "ro")
        }
        s.Mounts = append(s.Mounts, m)
    }

    linkedEnv, err := daemon.setupLinkedContainers(c)
    if err != nil {
        return nil, err
    }

    isHyperV := daemon.isHyperV(c)
    if isHyperV {
        s.Windows.HyperV = &specs.WindowsHyperV{}
    }

    // In s.Process
    s.Process.Cwd = c.Config.WorkingDir
    s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
    s.Process.Terminal = c.Config.Tty

    if c.Config.Tty {
        s.Process.ConsoleSize = &specs.Box{
            Height: c.HostConfig.ConsoleSize[0],
            Width:  c.HostConfig.ConsoleSize[1],
        }
    }
    s.Process.User.Username = c.Config.User
    s.Windows.LayerFolders, err = daemon.imageService.GetLayerFolders(img, c.RWLayer, c.ID)
    if err != nil {
        return nil, errors.Wrapf(err, "GetLayerFolders failed: container %s", c.ID)
    }

    // Get endpoints for the libnetwork allocated networks to the container
    var epList []string
    allowUnqualifiedDNSQuery := false
    gwHNSID := ""
    if c.NetworkSettings != nil {
        for n := range c.NetworkSettings.Networks {
            // Lookup failures are deliberately non-fatal: a network or
            // endpoint that cannot be resolved simply contributes nothing.
            sn, err := daemon.FindNetwork(n)
            if err != nil {
                continue
            }

            ep, err := getEndpointInNetwork(c.Name, sn)
            if err != nil {
                continue
            }

            data, err := ep.DriverInfo()
            if err != nil {
                continue
            }

            // Driver info is an untyped map; use checked assertions so an
            // entry of an unexpected type is ignored instead of panicking.
            if gwInfo, ok := data["GW_INFO"].(map[string]interface{}); ok {
                if id, ok := gwInfo["hnsid"].(string); ok {
                    gwHNSID = id
                }
            }

            if id, ok := data["hnsid"].(string); ok {
                epList = append(epList, id)
            }

            // Only the presence of the key matters, not its value.
            if data["AllowUnqualifiedDNSQuery"] != nil {
                allowUnqualifiedDNSQuery = true
            }
        }
    }

    var networkSharedContainerID string
    if c.HostConfig.NetworkMode.IsContainer() {
        networkSharedContainerID = c.NetworkSharedContainerID
        epList = append(epList, c.SharedEndpointList...)
    }

    // The gateway endpoint, when one was reported, goes last in the list.
    if gwHNSID != "" {
        epList = append(epList, gwHNSID)
    }

    // Container-level DNS search domains take precedence over the daemon's.
    var dnsSearch []string
    if len(c.HostConfig.DNSSearch) > 0 {
        dnsSearch = c.HostConfig.DNSSearch
    } else if len(daemonCfg.DNSSearch) > 0 {
        dnsSearch = daemonCfg.DNSSearch
    }

    s.Windows.Network = &specs.WindowsNetwork{
        AllowUnqualifiedDNSQuery:   allowUnqualifiedDNSQuery,
        DNSSearchList:              dnsSearch,
        EndpointList:               epList,
        NetworkSharedContainerName: networkSharedContainerID,
    }

    if err := daemon.createSpecWindowsFields(c, &s, isHyperV); err != nil {
        return nil, err
    }

    // Only pay for marshalling the spec when debug logging is enabled; a
    // marshalling failure merely suppresses the log line.
    if log.G(ctx).Level >= log.DebugLevel {
        if b, err := json.Marshal(&s); err == nil {
            log.G(ctx).Debugf("Generated spec: %s", string(b))
        }
    }

    return &s, nil
}

// createSpecWindowsFields fills in the Windows-specific parts of spec s for
// container c: hostname, working directory default, process command line or
// arguments, root filesystem path (process-isolated containers only),
// first-boot flush optimization, resource limits, credential spec and
// assigned devices.
//
// It returns an error when a process-isolated container has an empty BaseFS,
// when the backing device of a snapshotter mount cannot be determined, or
// when the credential spec or device configuration is invalid.
func (daemon *Daemon) createSpecWindowsFields(c *container.Container, s *specs.Spec, isHyperV bool) error {
    s.Hostname = c.FullHostname()

    if s.Process.Cwd == "" {
        // We default to C:\ to workaround the oddity of the case that the
        // default directory for cmd running as LocalSystem (or
        // ContainerAdministrator) is c:\windows\system32. Hence docker run
        // <image> cmd will by default end in c:\windows\system32, rather
        // than 'root' (/) on Linux. The oddity is that if you have a dockerfile
        // which has no WORKDIR and has a COPY file ., . will be interpreted
        // as c:\. Hence, setting it to default of c:\ makes for consistency.
        s.Process.Cwd = `C:\`
    }

    // Pre-escaped commands are passed as one command line; everything else
    // is passed as an argument vector.
    if !c.Config.ArgsEscaped {
        s.Process.Args = append([]string{c.Path}, c.Args...)
    } else if len(c.Args) == 0 {
        s.Process.CommandLine = c.Path
    } else {
        s.Process.CommandLine = c.Path + " " + system.EscapeArgs(c.Args)
    }

    s.Root.Readonly = false // Windows does not support a read-only root filesystem
    if !isHyperV {
        if c.BaseFS == "" {
            return errors.New("createSpecWindowsFields: BaseFS of container " + c.ID + " is unexpectedly empty")
        }

        // For the windowsfilter graphdriver, the underlying Get() call does
        // not actually mount the filesystem to a path, and BaseFS already is
        // the Volume GUID of the prepared/activated filesystem.
        // This is not set for Hyper-V containers.
        rootPath := c.BaseFS
        if daemon.UsesSnapshotter() {
            // daemon.Mount() for the snapshotters actually mounts the
            // filesystem to the host using containerd/mount.All and BaseFS
            // is the directory where this is mounted. This is consistent
            // with Linux-based graphdriver implementations.
            //
            // The spec for Root.Path for Windows specifies that for
            // Process-isolated containers, it must be in the Volume GUID
            // (\\?\\Volume{GUID} style), not a host-mounted directory, so
            // resolve the mount back to its backing device.
            backingDevicePath, err := getBackingDeviceForContainerdMount(c.BaseFS)
            if err != nil {
                return errors.Wrapf(err, "createSpecWindowsFields: Failed to get backing device of BaseFS of container %s", c.ID)
            }
            rootPath = backingDevicePath
        }
        // Ensure a correctly formatted volume GUID path \\?\Volume{GUID}\
        if !strings.HasSuffix(rootPath, `\`) {
            rootPath += `\`
        }
        s.Root.Path = rootPath
    }

    // First boot optimization
    s.Windows.IgnoreFlushesDuringBoot = !c.HasBeenStartedBefore

    setResourcesInSpec(c, s, isHyperV)

    // Read and add credentials from the security options if a credential spec has been provided.
    if err := daemon.setWindowsCredentialSpec(c, s); err != nil {
        return err
    }

    assigned, err := setupWindowsDevices(c.HostConfig.Devices)
    if err != nil {
        return err
    }
    s.Windows.Devices = append(s.Windows.Devices, assigned...)

    return nil
}

// getBackingDeviceForContainerdMount extracts the backing device or directory mounted at mountPoint
// by containerd's mount.Mount implementation for Windows.
//
// The layer path is read from the "containerd.io-source" alternate data
// stream of the (cleaned) mount point and then resolved to its mount path
// through hcsshim. An error is returned when the stream is missing or empty,
// cannot be read, or when hcsshim cannot resolve the layer.
func getBackingDeviceForContainerdMount(mountPoint string) (string, error) {
    // NOTE: This relies on details of the behaviour of containerd's mount implementation for Windows,
    // and so is somewhat fragile.
    // TODO: Upstream this into the mount package.
    // The implementation would be the same, but it'll be better-encapsulated.

    // See containerd/containerd/mount/mount_windows.go
    // This is mostly just copied from mount.Unmount

    const sourceStreamName = "containerd.io-source"

    mountPoint = filepath.Clean(mountPoint)
    adsFile := mountPoint + ":" + sourceStreamName
    var layerPath string

    // A stream that cannot be stat'ed is treated the same as an empty one and
    // reported below as "no layer source".
    if _, err := os.Lstat(adsFile); err == nil {
        layerPathb, err := os.ReadFile(adsFile)
        if err != nil {
            return "", fmt.Errorf("failed to retrieve layer source for mount %s: %w", mountPoint, err)
        }
        layerPath = string(layerPathb)
    }

    if layerPath == "" {
        return "", fmt.Errorf("no layer source for mount %s", mountPoint)
    }

    // The layer path splits into the driver's home directory and the layer ID.
    home, layerID := filepath.Split(layerPath)
    di := hcsshim.DriverInfo{
        HomeDir: home,
    }

    backingDevice, err := hcsshim.GetLayerMountPath(di, layerID)
    if err != nil {
        // Report the layer that failed to resolve, not the mount point.
        return "", fmt.Errorf("failed to retrieve backing device for layer %s: %w", layerPath, err)
    }

    return backingDevice, nil
}

// errInvalidCredentialSpecSecOpt is the invalid-parameter error returned by
// setWindowsCredentialSpec when the value of a credential spec security
// option is malformed: missing "://", empty after the scheme, or using a
// scheme that is not accepted.
var errInvalidCredentialSpecSecOpt = errdefs.InvalidParameter(fmt.Errorf("invalid credential spec security option - value must be prefixed by 'file://', 'registry://', or 'raw://' followed by a non-empty value"))

// setWindowsCredentialSpec resolves the `credentialspec` security options of
// the container and, when one yields a non-empty value, stores it in the
// spec's `Windows.CredentialSpec` field (allocating `s.Windows` if needed).
//
// Each option has the form `credentialspec=<scheme>://<value>` where scheme
// is one of file, registry, config or raw. Malformed or unsupported options
// produce an invalid-parameter error; a failing config-store lookup produces
// a system error.
func (daemon *Daemon) setWindowsCredentialSpec(c *container.Container, s *specs.Spec) error {
    if c.HostConfig == nil || c.HostConfig.SecurityOpt == nil {
        return nil
    }

    // TODO (jrouge/wk8): if provided with several security options, we silently ignore
    // all but the last one (provided they're all valid, otherwise we do return an error);
    // this doesn't seem like a great idea?
    var resolved string

    // TODO(thaJeztah): extract validating and parsing SecurityOpt to a reusable function.
    for _, opt := range c.HostConfig.SecurityOpt {
        name, rawValue, found := strings.Cut(opt, "=")
        if !found {
            return errdefs.InvalidParameter(fmt.Errorf("invalid security option: no equals sign in supplied value %s", opt))
        }
        // FIXME(thaJeztah): options should not be case-insensitive
        if !strings.EqualFold(name, "credentialspec") {
            return errdefs.InvalidParameter(fmt.Errorf("security option not supported: %s", name))
        }

        scheme, value, found := strings.Cut(rawValue, "://")
        if !found || value == "" {
            return errInvalidCredentialSpecSecOpt
        }

        switch strings.ToLower(scheme) {
        case "raw":
            // The value is the credential spec itself.
            resolved = value
        case "file":
            contents, err := readCredentialSpecFile(c.ID, daemon.root, filepath.Clean(value))
            if err != nil {
                return errdefs.InvalidParameter(err)
            }
            resolved = contents
        case "registry":
            contents, err := readCredentialSpecRegistry(c.ID, value)
            if err != nil {
                return errdefs.InvalidParameter(err)
            }
            resolved = contents
        case "config":
            // if the container does not have a DependencyStore, then it
            // isn't swarmkit managed. In order to avoid creating any
            // impression that `config://` is a valid API, return the same
            // error as if you'd passed any other random word.
            if c.DependencyStore == nil {
                return errInvalidCredentialSpecSecOpt
            }

            csConfig, err := c.DependencyStore.Configs().Get(value)
            if err != nil {
                return errdefs.System(errors.Wrap(err, "error getting value from config store"))
            }
            // stuff the resulting config data into a string to use as the
            // CredentialSpec
            resolved = string(csConfig.Spec.Data)
        default:
            return errInvalidCredentialSpecSecOpt
        }
    }

    if resolved == "" {
        // Nothing to record; leave the spec untouched.
        return nil
    }
    if s.Windows == nil {
        s.Windows = &specs.Windows{}
    }
    s.Windows.CredentialSpec = resolved
    return nil
}

// setResourcesInSpec translates the CPU, memory and storage limits of the
// container's HostConfig into s.Windows.Resources. The Resources struct and
// each of its sections are only populated when a corresponding limit is set.
//
// When NanoCPUs is set it overrides the CPU maximum: for Hyper-V containers
// it is turned into a CPU count (rounded up) plus a maximum for the partial
// CPU, for process-isolated containers into a maximum relative to the
// number of CPUs reported by sysinfo.NumCPU.
func setResourcesInSpec(c *container.Container, s *specs.Spec, isHyperV bool) {
    // resources returns s.Windows.Resources, allocating it on first use.
    resources := func() *specs.WindowsResources {
        if s.Windows.Resources == nil {
            s.Windows.Resources = &specs.WindowsResources{}
        }
        return s.Windows.Resources
    }

    // In s.Windows.Resources
    var (
        shares  = uint16(c.HostConfig.CPUShares)
        maximum = uint16(c.HostConfig.CPUPercent) * 100
        count   = uint64(c.HostConfig.CPUCount)
    )

    if nanoCPUs := c.HostConfig.NanoCPUs; nanoCPUs > 0 {
        // Divisor used to convert NanoCPUs into the spec's CPU maximum.
        const nanoCPUsPerUnit = 1e9 / 10000

        if !isHyperV {
            maximum = uint16(nanoCPUs / int64(sysinfo.NumCPU()) / nanoCPUsPerUnit)
            if maximum == 0 {
                // The requested NanoCPUs is so small that we rounded to 0, use 1 instead
                maximum = 1
            }
        } else {
            count = uint64(nanoCPUs / 1e9)
            if nanoCPUs%1e9 != 0 {
                // A fractional CPU was requested: round the count up and
                // cap the usage across those CPUs.
                count++
                maximum = uint16(nanoCPUs / int64(count) / nanoCPUsPerUnit)
                if maximum == 0 {
                    // The requested NanoCPUs is so small that we rounded to 0, use 1 instead
                    maximum = 1
                }
            }
        }
    }

    if maximum != 0 || shares != 0 || count != 0 {
        resources().CPU = &specs.WindowsCPUResources{
            Maximum: &maximum,
            Shares:  &shares,
            Count:   &count,
        }
    }

    if limit := uint64(c.HostConfig.Memory); limit != 0 {
        resources().Memory = &specs.WindowsMemoryResources{
            Limit: &limit,
        }
    }

    if c.HostConfig.IOMaximumBandwidth != 0 || c.HostConfig.IOMaximumIOps != 0 {
        // These point directly at the HostConfig fields rather than copies.
        resources().Storage = &specs.WindowsStorageResources{
            Bps:  &c.HostConfig.IOMaximumBandwidth,
            Iops: &c.HostConfig.IOMaximumIOps,
        }
    }
}

// mergeUlimits merges the Ulimits from HostConfig with the daemon defaults
// and updates HostConfig. It does nothing on non-Linux platforms, so this
// Windows implementation ignores both arguments.
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig, daemonCfg *config.Config) {
    // Intentionally empty.
}

// registryKey abstracts the subset of `registry.Key` that this file uses.
// Declaring it as an interface mainly exists so that tests can substitute a
// mock for the real registry.
type registryKey interface {
    // Close releases the key.
    Close() error
    // GetStringValue returns the string value stored under name together
    // with its value type.
    GetStringValue(name string) (string, uint32, error)
}

// registryOpenKeyFunc opens the registry key at path under baseKey with the
// requested access by delegating to registry.OpenKey, returning the result
// as a registryKey. It is a package-level variable rather than a plain
// function, presumably so tests can replace it with a mock (see registryKey).
var registryOpenKeyFunc = func(baseKey registry.Key, path string, access uint32) (registryKey, error) {
    return registry.OpenKey(baseKey, path, access)
}

// readCredentialSpecRegistry reads the credential spec stored as string value
// name under credentialSpecRegistryLocation in HKEY_LOCAL_MACHINE.
//
// id is the container ID and is only used to give error messages context.
// An error is returned when the key cannot be opened, when the value does
// not exist, or when it cannot be read; in all of those cases the returned
// string is empty.
func readCredentialSpecRegistry(id, name string) (string, error) {
    key, err := registryOpenKeyFunc(registry.LOCAL_MACHINE, credentialSpecRegistryLocation, registry.QUERY_VALUE)
    if err != nil {
        return "", errors.Wrapf(err, "failed handling spec %q for container %s - registry key %s could not be opened", name, id, credentialSpecRegistryLocation)
    }
    defer key.Close()

    value, _, err := key.GetStringValue(name)
    if err != nil {
        // Use errors.Is so a wrapped "not exist" error is still reported
        // with the dedicated not-found message.
        if errors.Is(err, registry.ErrNotExist) {
            return "", fmt.Errorf("registry credential spec %q for container %s was not found", name, id)
        }
        return "", errors.Wrapf(err, "error reading credential spec %q from registry for container %s", name, id)
    }

    return value, nil
}

// readCredentialSpecFile reads a credential spec from a file below the
// credentialSpecFileLocation directory of root.
//
// location must be a relative path that stays inside that directory:
// absolute paths and paths that resolve outside of it are rejected. id is
// the container ID and is only used to give error messages context. Any
// failure to read the file, including the file not existing, is returned as
// an error together with an empty string.
func readCredentialSpecFile(id, root, location string) (string, error) {
    if filepath.IsAbs(location) {
        return "", fmt.Errorf("invalid credential spec: file:// path cannot be absolute")
    }
    base := filepath.Join(root, credentialSpecFileLocation)
    full := filepath.Join(base, location)

    // Check containment by path component, not by raw string prefix: a
    // plain prefix test would accept a sibling directory whose name merely
    // starts with the base name (e.g. "..\CredentialSpecsOther\x").
    rel, err := filepath.Rel(base, full)
    if err != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
        return "", fmt.Errorf("invalid credential spec: file:// path must be under %s", base)
    }

    bcontents, err := os.ReadFile(full)
    if err != nil {
        return "", errors.Wrapf(err, "failed to load credential spec for container %s", id)
    }
    return string(bcontents), nil
}

// setupWindowsDevices converts the device mappings of a container into
// Windows device entries for the OCI spec.
//
// PathOnHost of each mapping must be either `class/<ID>`, which yields a
// device of IDType "class", or `<IDType>://<ID>` with a non-empty IDType.
// Any other form results in an error and a nil slice.
func setupWindowsDevices(devices []containertypes.DeviceMapping) (specDevices []specs.WindowsDevice, err error) {
    const classPrefix = "class/"

    for _, mapping := range devices {
        hostPath := mapping.PathOnHost

        // Legacy form: class/<ID>
        if strings.HasPrefix(hostPath, classPrefix) {
            specDevices = append(specDevices, specs.WindowsDevice{
                ID:     hostPath[len(classPrefix):],
                IDType: "class",
            })
            continue
        }

        // General form: <IDType>://<ID>
        idType, id, found := strings.Cut(hostPath, "://")
        switch {
        case !found:
            return nil, errors.Errorf("invalid device assignment path: '%s', must be 'class/ID' or 'IDType://ID'", hostPath)
        case idType == "":
            return nil, errors.Errorf("invalid device assignment path: '%s', IDType cannot be empty", hostPath)
        }
        specDevices = append(specDevices, specs.WindowsDevice{
            ID:     id,
            IDType: idType,
        })
    }

    return specDevices, nil
}