dotcloud/docker

View on GitHub
daemon/nvidia_linux.go

Summary

Maintainability
A
0 mins
Test Coverage
package daemon

import (
    "os"
    "os/exec"
    "strconv"
    "strings"

    "github.com/containerd/containerd/contrib/nvidia"
    "github.com/docker/docker/daemon/internal/capabilities"
    specs "github.com/opencontainers/runtime-spec/specs-go"
    "github.com/pkg/errors"
)

// TODO: nvidia should not be hard-coded, and should be a device plugin instead on the daemon object.
// TODO: add list of device capabilities in daemon/node info

var errConflictCountDeviceIDs = errors.New("cannot set both Count and DeviceIDs on device request")

const nvidiaHook = "nvidia-container-runtime-hook"

// These are NVIDIA-specific capabilities stolen from github.com/containerd/containerd/contrib/nvidia.allCaps
var allNvidiaCaps = map[nvidia.Capability]struct{}{
    nvidia.Compute:  {},
    nvidia.Compat32: {},
    nvidia.Graphics: {},
    nvidia.Utility:  {},
    nvidia.Video:    {},
    nvidia.Display:  {},
}

func init() {
    if _, err := exec.LookPath(nvidiaHook); err != nil {
        // do not register Nvidia driver if helper binary is not present.
        return
    }
    capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}}
    nvidiaDriver := &deviceDriver{
        capset:     capset,
        updateSpec: setNvidiaGPUs,
    }
    for c := range allNvidiaCaps {
        nvidiaDriver.capset[string(c)] = struct{}{}
    }
    registerDeviceDriver("nvidia", nvidiaDriver)
}

func setNvidiaGPUs(s *specs.Spec, dev *deviceInstance) error {
    req := dev.req
    if req.Count != 0 && len(req.DeviceIDs) > 0 {
        return errConflictCountDeviceIDs
    }

    if len(req.DeviceIDs) > 0 {
        s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+strings.Join(req.DeviceIDs, ","))
    } else if req.Count > 0 {
        s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+countToDevices(req.Count))
    } else if req.Count < 0 {
        s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES=all")
    }

    var nvidiaCaps []string
    // req.Capabilities contains device capabilities, some but not all are NVIDIA driver capabilities.
    for _, c := range dev.selectedCaps {
        nvcap := nvidia.Capability(c)
        if _, isNvidiaCap := allNvidiaCaps[nvcap]; isNvidiaCap {
            nvidiaCaps = append(nvidiaCaps, c)
            continue
        }
        // TODO: nvidia.WithRequiredCUDAVersion
        // for now we let the prestart hook verify cuda versions but errors are not pretty.
    }

    if nvidiaCaps != nil {
        s.Process.Env = append(s.Process.Env, "NVIDIA_DRIVER_CAPABILITIES="+strings.Join(nvidiaCaps, ","))
    }

    path, err := exec.LookPath(nvidiaHook)
    if err != nil {
        return err
    }

    if s.Hooks == nil {
        s.Hooks = &specs.Hooks{}
    }

    // This implementation uses prestart hooks, which are deprecated.
    // CreateRuntime is the closest equivalent, and executed in the same
    // locations as prestart-hooks, but depending on what these hooks do,
    // possibly one of the other hooks could be used instead (such as
    // CreateContainer or StartContainer).
    s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ //nolint:staticcheck // FIXME(thaJeztah); replace prestart hook with a non-deprecated one.
        Path: path,
        Args: []string{
            nvidiaHook,
            "prestart",
        },
        Env: os.Environ(),
    })

    return nil
}

// countToDevices returns the list 0, 1, ... count-1 of deviceIDs.
func countToDevices(count int) string {
    devices := make([]string, count)
    for i := range devices {
        devices[i] = strconv.Itoa(i)
    }
    return strings.Join(devices, ",")
}