dotcloud/docker

View on GitHub
libnetwork/drivers/overlay/ov_network.go

Summary

Maintainability
A
0 mins
Test Coverage
//go:build linux

package overlay

import (
    "context"
    "errors"
    "fmt"
    "net"
    "os"
    "path/filepath"
    "runtime"
    "strconv"
    "strings"
    "sync"

    "github.com/containerd/log"
    "github.com/docker/docker/libnetwork/driverapi"
    "github.com/docker/docker/libnetwork/drivers/overlay/overlayutils"
    "github.com/docker/docker/libnetwork/netlabel"
    "github.com/docker/docker/libnetwork/ns"
    "github.com/docker/docker/libnetwork/osl"
    "github.com/docker/docker/libnetwork/types"
    "github.com/hashicorp/go-multierror"
    "github.com/vishvananda/netlink"
    "github.com/vishvananda/netns"
    "golang.org/x/sys/unix"
)

// Package-level state shared by all overlay networks in this process.
var (
    // networkOnce makes populateVNITbl run a single time; it is triggered
    // from (*network).joinSandbox.
    networkOnce sync.Once
    // networkMu is held around the vniTbl lookups and deletions performed by
    // setupSubnetSandbox and cleanupStaleSandboxes.
    networkMu   sync.Mutex
    // vniTbl maps a VXLAN ID to the path of the network namespace in which
    // populateVNITbl found a vxlan link carrying that ID.
    vniTbl      = make(map[uint32]string)
)

// networkTable indexes overlay networks by a string key.
// NOTE(review): the key is presumably the network ID — confirm against the
// code that populates this type.
type networkTable map[string]*network

// subnet holds the per-subnet state of an overlay network: its addressing,
// its VXLAN ID and the interfaces created for it inside the network sandbox.
type subnet struct {
    // sboxInit is set once initSubnetSandbox has succeeded for this subnet
    // and is cleared again by leaveSandbox when the sandbox is torn down.
    sboxInit  bool
    // vxlanName is the vxlan interface name recorded by initSubnetSandbox
    // after a successful setup.
    vxlanName string
    // brName is the bridge interface name recorded by initSubnetSandbox
    // after a successful setup.
    brName    string
    // vni is the VXLAN ID assigned to this subnet in CreateNetwork.
    vni       uint32
    // initErr is what joinSandbox reports for a subnet that is already
    // marked initialised. Within this file it is only ever assigned nil.
    initErr   error
    // subnetIP is the address pool of the subnet (IPAM Pool).
    subnetIP  *net.IPNet
    // gwIP is the gateway address of the subnet (IPAM Gateway); it is the
    // address given to the subnet's bridge in setupSubnetSandbox.
    gwIP      *net.IPNet
}

// network is the driver's in-memory representation of one overlay network.
// The embedded mutex is taken by joinSandbox, leaveSandbox and sandbox.
type network struct {
    // id is the network ID passed to CreateNetwork.
    id        string
    // sbox is the network's sandbox; set by initSandbox and reset to nil by
    // destroySandbox.
    sbox      *osl.Namespace
    // endpoints holds the endpoints of this network; DeleteNetwork walks it
    // to remove leftover links.
    endpoints endpointTable
    // driver is the driver instance that created this network.
    driver    *driver
    // joinCnt is incremented by joinSandbox (when asked to) and decremented
    // by leaveSandbox; the sandbox is destroyed when it reaches zero.
    joinCnt   int
    // sboxInit is set after the first initSandbox attempt, whether or not it
    // succeeded, and cleared by leaveSandbox on teardown.
    sboxInit  bool
    // initEpoch is bumped on every initSandbox call and is part of the
    // sandbox key.
    initEpoch int
    // initErr is the result of the last initSandbox attempt.
    initErr   error
    // subnets lists the subnets of this network, one per IPv4 pool.
    subnets   []*subnet
    // secure is true when the secureOption key was present at creation.
    secure    bool
    // mtu is the value parsed from netlabel.DriverMTU, or zero if the option
    // was not supplied.
    mtu       int
    sync.Mutex
}

// init locks the goroutine running package initialisation to its OS thread.
func init() {
    // Lock main() to the initial thread to exclude the goroutines executing
    // func setDefaultVLAN() from being scheduled onto that thread. Changes to
    // the network namespace of the initial thread alter /proc/self/ns/net,
    // which would break any code which (incorrectly) assumes that /proc/self/ns/net
    // is a handle to the network namespace for the thread it is currently
    // executing on.
    runtime.LockOSThread()
}

// NetworkAllocate is not supported by this driver: it ignores its arguments
// and always returns a nil map together with a "not implemented" error.
func (d *driver) NetworkAllocate(id string, option map[string]string, ipV4Data, ipV6Data []driverapi.IPAMData) (map[string]string, error) {
    notImplemented := types.NotImplementedErrorf("not implemented")
    return nil, notImplemented
}

// NetworkFree is not supported by this driver: it ignores id and always
// returns a "not implemented" error.
func (d *driver) NetworkFree(id string) error {
    notImplemented := types.NotImplementedErrorf("not implemented")
    return notImplemented
}

// CreateNetwork validates the supplied options and registers a new overlay
// network with the driver.
//
// id must be non-empty and ipV4Data must contain at least one pool, the first
// of which must not be 0.0.0.0/0. option must hold a map[string]string under
// netlabel.GenericData with a netlabel.OverlayVxlanIDList entry that yields
// at least one VNI per IPv4 pool; VNIs are paired with pools by position. The
// presence of secureOption (with any value) marks the network as secure, and
// netlabel.DriverMTU, when present, must parse as a non-negative integer.
// ipV6Data is not used. When nInfo is non-nil it is used to register for
// ovPeerTable events.
//
// An error is returned if validation fails, if the driver cannot be
// configured, if a network with the same id is already registered, or if the
// table event registration fails.
func (d *driver) CreateNetwork(id string, option map[string]interface{}, nInfo driverapi.NetworkInfo, ipV4Data, ipV6Data []driverapi.IPAMData) error {
    if id == "" {
        return fmt.Errorf("invalid network id")
    }
    if len(ipV4Data) == 0 || ipV4Data[0].Pool.String() == "0.0.0.0/0" {
        return types.InvalidParameterErrorf("ipv4 pool is empty")
    }

    // Since we perform lazy configuration make sure we try
    // configuring the driver when we enter CreateNetwork
    if err := d.configure(); err != nil {
        return err
    }

    gval, ok := option[netlabel.GenericData]
    if !ok {
        return fmt.Errorf("option %s is missing", netlabel.GenericData)
    }
    // Use the comma-ok form so that a wrongly typed option is reported as an
    // error instead of panicking.
    optMap, ok := gval.(map[string]string)
    if !ok {
        return fmt.Errorf("option %s has unexpected type %T", netlabel.GenericData, gval)
    }

    vnisOpt, ok := optMap[netlabel.OverlayVxlanIDList]
    if !ok {
        return errors.New("no VNI provided")
    }
    log.G(context.TODO()).Debugf("overlay: Received vxlan IDs: %s", vnisOpt)
    vnis, err := overlayutils.AppendVNIList(make([]uint32, 0, len(ipV4Data)), vnisOpt)
    if err != nil {
        return err
    }

    n := &network{
        id:        id,
        driver:    d,
        endpoints: endpointTable{},
        subnets:   make([]*subnet, 0, len(ipV4Data)),
    }

    if _, ok := optMap[secureOption]; ok {
        n.secure = true
    }
    if val, ok := optMap[netlabel.DriverMTU]; ok {
        mtu, err := strconv.Atoi(val)
        if err != nil {
            return fmt.Errorf("failed to parse %v: %w", val, err)
        }
        if mtu < 0 {
            return fmt.Errorf("invalid MTU value: %v", mtu)
        }
        n.mtu = mtu
    }

    if len(vnis) == 0 {
        return errors.New("no VNI provided")
    }
    if len(vnis) < len(ipV4Data) {
        return fmt.Errorf("insufficient vnis(%d) passed to overlay", len(vnis))
    }

    // Pair every IPv4 pool with the VNI at the same position.
    for i, ipd := range ipV4Data {
        n.subnets = append(n.subnets, &subnet{
            subnetIP: ipd.Pool,
            gwIP:     ipd.Gateway,
            vni:      vnis[i],
        })
    }

    d.Lock()
    defer d.Unlock()
    if d.networks[n.id] != nil {
        return fmt.Errorf("attempt to create overlay network %v that already exists", n.id)
    }

    // Make sure no rule is on the way from any stale secure network.
    // This is best-effort cleanup, so failures are deliberately ignored.
    if !n.secure {
        for _, vni := range vnis {
            d.programMangle(vni, false)
            d.programInput(vni, false)
        }
    }

    if nInfo != nil {
        if err := nInfo.TableEventRegister(ovPeerTable, driverapi.EndpointObject); err != nil {
            // XXX Undo writeToStore?  No method to so.  Why?
            return err
        }
    }

    d.networks[id] = n

    return nil
}

// DeleteNetwork removes the overlay network nid from the driver.
//
// It deletes any links still named by the network's endpoints, drops the
// network from the driver's table and, for secure networks, removes the
// per-VNI iptables rules (failures there are only logged). Once the driver
// lock has been released the peers of the network are flushed.
//
// An error is returned when nid is empty, when the driver cannot be
// configured, or when no network with that id is registered.
func (d *driver) DeleteNetwork(nid string) error {
    if nid == "" {
        return fmt.Errorf("invalid network id")
    }

    // The driver is configured lazily, so make sure that has happened.
    if err := d.configure(); err != nil {
        return err
    }

    // The peer flush must run only after the driver lock is dropped,
    // otherwise we could deadlock with the peerDB.
    flushPeers := false
    d.Lock()
    defer func() {
        d.Unlock()
        if !flushPeers {
            return
        }
        d.peerFlush(nid)
    }()

    // Equivalent to d.network(), except that the lock stays held until the
    // network has been removed.
    n := d.networks[nid]
    if n == nil {
        return fmt.Errorf("could not find network with id %s", nid)
    }

    for _, ep := range n.endpoints {
        if ep.ifName == "" {
            continue
        }
        link, err := ns.NlHandle().LinkByName(ep.ifName)
        if err != nil {
            continue
        }
        if err := ns.NlHandle().LinkDel(link); err != nil {
            log.G(context.TODO()).WithError(err).Warnf("Failed to delete interface (%s)'s link on endpoint (%s) delete", ep.ifName, ep.id)
        }
    }

    flushPeers = true
    delete(d.networks, nid)

    if !n.secure {
        return nil
    }

    // Shared warning for both rule-removal steps below.
    warnCleanup := func(err error, s *subnet) {
        log.G(context.TODO()).WithFields(log.Fields{
            "error":      err,
            "network_id": n.id,
            "subnet":     s.subnetIP,
        }).Warn("Failed to clean up iptables rules during overlay network deletion")
    }
    for _, s := range n.subnets {
        if err := d.programMangle(s.vni, false); err != nil {
            warnCleanup(err, s)
        }
        if err := d.programInput(s.vni, false); err != nil {
            warnCleanup(err, s)
        }
    }

    return nil
}

// ProgramExternalConnectivity is a no-op for this driver: all arguments are
// ignored and nil is always returned.
func (d *driver) ProgramExternalConnectivity(_ context.Context, nid, eid string, options map[string]interface{}) error {
    return nil
}

// RevokeExternalConnectivity is a no-op for this driver: both arguments are
// ignored and nil is always returned.
func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
    return nil
}

// joinSandbox makes sure the network sandbox and the sandbox resources of
// subnet s exist, creating them on first use.
//
// A failed network sandbox initialisation is remembered and reported on every
// later call until leaveSandbox resets it. A failed subnet initialisation is
// not remembered, so the next call retries it. When incJoinCount is true the
// join counter is incremented on success. After a successful first-time
// network sandbox initialisation, the peerDB initialisation for this network
// is started in a separate goroutine.
func (n *network) joinSandbox(s *subnet, incJoinCount bool) error {
    // Concurrent callers block here until the single population run is done.
    networkOnce.Do(populateVNITbl)

    n.Lock()
    // The peerDB must be told to initialise the sandbox with the peers
    // already received from networkdb only after the network lock has been
    // released. Otherwise we could deadlock on the peerDB channel while the
    // peerDB is waiting for the network lock.
    startPeerDBInit := false
    defer func() {
        n.Unlock()
        if !startPeerDBInit {
            return
        }
        go n.driver.initSandboxPeerDB(n.id)
    }()

    if !n.sboxInit {
        sandboxErr := n.initSandbox()
        n.initErr = sandboxErr
        // A failure here cannot be recovered from, so mark the attempt as
        // done either way.
        n.sboxInit = true
        startPeerDBInit = sandboxErr == nil
    }
    if n.initErr != nil {
        return fmt.Errorf("network sandbox join failed: %v", n.initErr)
    }

    var subnetErr error
    if s.sboxInit {
        subnetErr = s.initErr
    } else if subnetErr = n.initSubnetSandbox(s); subnetErr == nil {
        // Subnet errors are recoverable: only a success is recorded.
        s.initErr = nil
        s.sboxInit = true
    }
    if subnetErr != nil {
        return fmt.Errorf("subnet sandbox join failed for %q: %v", s.subnetIP.String(), subnetErr)
    }

    if incJoinCount {
        n.joinCnt++
    }
    return nil
}

// leaveSandbox decrements the join counter. When the counter reaches zero it
// destroys the sandbox and resets the initialisation state of the network and
// of all its subnets, so that a later join starts from scratch.
func (n *network) leaveSandbox() {
    n.Lock()
    defer n.Unlock()

    if n.joinCnt--; n.joinCnt != 0 {
        return
    }

    n.destroySandbox()

    n.sboxInit, n.initErr = false, nil
    for i := range n.subnets {
        sub := n.subnets[i]
        sub.sboxInit, sub.initErr = false, nil
    }
}

// destroySandbox removes every interface from the network sandbox, deletes
// the vxlan interfaces recorded on the subnets, destroys the sandbox and
// clears n.sbox. It does nothing when there is no sandbox. Failures are
// logged and otherwise ignored.
//
// The caller must hold the network lock.
func (n *network) destroySandbox() {
    sbox := n.sbox
    if sbox == nil {
        return
    }

    for _, iface := range sbox.Interfaces() {
        if err := iface.Remove(); err != nil {
            log.G(context.TODO()).Debugf("Remove interface %s failed: %v", iface.SrcName(), err)
        }
    }

    for _, s := range n.subnets {
        if s.vxlanName == "" {
            continue
        }
        if err := deleteInterface(s.vxlanName); err != nil {
            log.G(context.TODO()).Warnf("could not cleanup sandbox properly: %v", err)
        }
    }

    sbox.Destroy()
    n.sbox = nil
}

// populateVNITbl walks the directory in which osl.GenerateKey places sandbox
// keys and fills vniTbl with VXLAN ID -> namespace path for every vxlan link
// found in a namespace whose file name contains a "-".
//
// It is best effort: entries that cannot be visited, opened or listed are
// logged and skipped. It is run through networkOnce from joinSandbox and
// writes vniTbl without taking networkMu.
func populateVNITbl() {
    // The callback never returns an error, so WalkDir's own return value
    // carries no information and is not checked.
    filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
        func(path string, _ os.DirEntry, walkErr error) error {
            if walkErr != nil {
                // The entry could not be stat'ed or read, so there is
                // nothing to inspect here. Keep walking the rest.
                log.G(context.TODO()).Debugf("Skipping %s during vni population: %v", path, walkErr)
                return nil
            }

            _, fname := filepath.Split(path)
            if !strings.Contains(fname, "-") {
                return nil
            }

            nsHandle, err := netns.GetFromPath(path)
            if err != nil {
                log.G(context.TODO()).Errorf("Could not open namespace path %s during vni population: %v", path, err)
                return nil
            }
            defer nsHandle.Close()

            nlh, err := netlink.NewHandleAt(nsHandle, unix.NETLINK_ROUTE)
            if err != nil {
                log.G(context.TODO()).Errorf("Could not open netlink handle during vni population for ns %s: %v", path, err)
                return nil
            }
            defer nlh.Close()

            // A missing timeout is not fatal; carry on with the listing.
            if err := nlh.SetSocketTimeout(soTimeout); err != nil {
                log.G(context.TODO()).Warnf("Failed to set the timeout on the netlink handle sockets for vni table population: %v", err)
            }

            links, err := nlh.LinkList()
            if err != nil {
                log.G(context.TODO()).Errorf("Failed to list interfaces during vni population for ns %s: %v", path, err)
                return nil
            }

            for _, l := range links {
                if l.Type() != "vxlan" {
                    continue
                }
                // Checked assertion: a link that reports the vxlan type but
                // is not a *netlink.Vxlan is skipped instead of panicking.
                vx, ok := l.(*netlink.Vxlan)
                if !ok {
                    continue
                }
                vniTbl[uint32(vx.VxlanId)] = path
            }

            return nil
        })
}

// generateVxlanName returns the vxlan interface name for subnet s:
// "vx-", the VNI as at least six zero-padded hex digits, "-", and at most the
// first five bytes of the network ID.
func (n *network) generateVxlanName(s *subnet) string {
    const maxIDLen = 5

    shortID := n.id
    if len(shortID) > maxIDLen {
        shortID = shortID[:maxIDLen]
    }

    return fmt.Sprintf("vx-%06x-%v", s.vni, shortID)
}

// generateBridgeName returns the bridge interface name for subnet s: the
// prefix from getBridgeNamePrefix, "-", and at most the first five bytes of
// the network ID.
func (n *network) generateBridgeName(s *subnet) string {
    const maxIDLen = 5

    shortID := n.id
    if len(shortID) > maxIDLen {
        shortID = shortID[:maxIDLen]
    }

    return strings.Join([]string{n.getBridgeNamePrefix(s), shortID}, "-")
}

// getBridgeNamePrefix returns "ov-" followed by the VNI of subnet s as at
// least six zero-padded hex digits.
func (n *network) getBridgeNamePrefix(s *subnet) string {
    prefix := fmt.Sprintf("ov-%06x", s.vni)
    return prefix
}

// setupSubnetSandbox creates the bridge brName and the vxlan device vxlanName
// for subnet s and places both in the network sandbox, with the vxlan device
// attached to the bridge.
//
// If vniTbl says the subnet's VNI is still in use by another namespace, that
// namespace is torn down first. When attaching the vxlan device fails, the
// bridge and the vxlan device are removed again so the caller can retry. A
// failure to set the bridge default VLAN is logged but not returned.
func (n *network) setupSubnetSandbox(s *subnet, brName, vxlanName string) error {
    // vniTbl was filled by the once-only population run. A hit for this VNI
    // means a stale namespace from a previous life still owns it, so destroy
    // that namespace completely and reclaim its resources.
    networkMu.Lock()
    stalePath, stale := vniTbl[s.vni]
    networkMu.Unlock()

    if stale {
        deleteVxlanByVNI(stalePath, s.vni)
        if err := unix.Unmount(stalePath, unix.MNT_FORCE); err != nil {
            log.G(context.TODO()).Errorf("unmount of %s failed: %v", stalePath, err)
        }
        os.Remove(stalePath)

        networkMu.Lock()
        delete(vniTbl, s.vni)
        networkMu.Unlock()
    }

    // Bridge first, then the vxlan device that gets enslaved to it.
    sbox := n.sbox

    bridgeErr := sbox.AddInterface(context.TODO(), brName, "br", osl.WithIPv4Address(s.gwIP), osl.WithIsBridge(true))
    if bridgeErr != nil {
        return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), bridgeErr)
    }

    v6transport, transportErr := n.driver.isIPv6Transport()
    if transportErr != nil {
        log.G(context.TODO()).WithError(transportErr).Errorf("Assuming IPv4 transport; overlay network %s will not pass traffic if the Swarm data plane is IPv6.", n.id)
    }
    if err := createVxlan(vxlanName, s.vni, n.maxMTU(), v6transport); err != nil {
        return err
    }

    attachErr := sbox.AddInterface(context.TODO(), vxlanName, "vxlan", osl.WithMaster(brName))
    if attachErr == nil {
        if err := setDefaultVLAN(sbox); err != nil {
            // not a fatal error
            log.G(context.TODO()).WithError(err).Error("set bridge default vlan failed")
        }
        return nil
    }

    // Moving the vxlan device into the overlay namespace failed. Take the
    // bridge back out of the namespace so that the setup can be attempted
    // again by the caller.
    for _, iface := range sbox.Interfaces() {
        if iface.SrcName() != brName {
            continue
        }
        if ierr := iface.Remove(); ierr != nil {
            log.G(context.TODO()).Errorf("removing bridge failed from ov ns %v failed, %v", n.sbox.Key(), ierr)
        }
    }

    // The vxlan interface has to go as well. A global vni id is tied to it,
    // so an orphaned vxlan interface would make vxlan device creation fail
    // if the vni is later assigned to some other network.
    if deleteErr := deleteInterface(vxlanName); deleteErr != nil {
        log.G(context.TODO()).Warnf("could not delete vxlan interface, %s, error %v, after config error, %v", vxlanName, deleteErr, attachErr)
    }
    return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), attachErr)
}

// setDefaultVLAN writes "0\n" to the bridge/default_pvid sysfs attribute of a
// bridge interface in the given namespace.
//
// The bridge is picked by scanning ns.Interfaces(): when several interfaces
// report Bridge() the last one wins, and when none does brName stays empty
// and the sysfs path is built without an interface name.
// NOTE(review): writing 0 presumably disables the default PVID, while the
// error text below says "enable" — confirm the intent.
//
// The error from ns.InvokeFunc is returned if there is one; otherwise the
// first error hit inside the invoked function, if any.
//
// Inside this function the parameter ns shadows the imported ns package.
func setDefaultVLAN(ns *osl.Namespace) error {
    var brName string
    for _, i := range ns.Interfaces() {
        if i.Bridge() {
            brName = i.DstName()
        }
    }

    // IFLA_BR_VLAN_DEFAULT_PVID was added in Linux v4.4 (see torvalds/linux@0f963b7), so we can't use netlink for
    // setting this until Docker drops support for CentOS/RHEL 7 (kernel 3.10, eol date: 2024-06-30).
    // innerErr carries a failure out of the callback, which cannot return one.
    var innerErr error
    err := ns.InvokeFunc(func() {
        // Contrary to what the sysfs(5) man page says, the entries of /sys/class/net
        // represent the networking devices visible in the network namespace of the
        // process which mounted the sysfs filesystem, irrespective of the network
        // namespace of the process accessing the directory. Remount sysfs in order to
        // see the network devices in sbox's network namespace, making sure the mount
        // doesn't propagate back.
        //
        // The Linux implementation of (osl.Sandbox).InvokeFunc() runs the function in a
        // dedicated goroutine. The effects of unshare(CLONE_NEWNS) on a thread cannot
        // be reverted so the thread needs to be terminated once the goroutine is
        // finished.
        // There is deliberately no matching UnlockOSThread call in this function.
        runtime.LockOSThread()
        if err := unix.Unshare(unix.CLONE_NEWNS); err != nil {
            innerErr = os.NewSyscallError("unshare", err)
            return
        }
        // Mark the whole mount tree as a recursive slave before mounting sysfs.
        if err := unix.Mount("", "/", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
            innerErr = &os.PathError{Op: "mount", Path: "/", Err: err}
            return
        }
        if err := unix.Mount("sysfs", "/sys", "sysfs", 0, ""); err != nil {
            innerErr = &os.PathError{Op: "mount", Path: "/sys", Err: err}
            return
        }

        path := filepath.Join("/sys/class/net", brName, "bridge/default_pvid")
        data := []byte{'0', '\n'}

        if err := os.WriteFile(path, data, 0o644); err != nil {
            innerErr = fmt.Errorf("failed to enable default vlan on bridge %s: %w", brName, err)
            return
        }
    })
    if err != nil {
        return err
    }
    return innerErr
}

// initSubnetSandbox programs the per-VNI iptables rules for subnet s, creates
// its bridge and vxlan interfaces in the network sandbox and, on success,
// records the interface names on s.
//
// Must be called with the network lock held.
func (n *network) initSubnetSandbox(s *subnet) error {
    bridge, vxlan := n.generateBridgeName(s), n.generateVxlanName(s)
    drv := n.driver

    // For a secure network this installs the rules that make encryption
    // mandatory. For a non-secure one it removes rules left behind by a stale
    // secure network that previously had the same VNI.
    if err := drv.programMangle(s.vni, n.secure); err != nil {
        return err
    }
    // A programInput failure is fatal only for secure networks, where the
    // mangle rule programmed above is rolled back. On a non-secure network
    // the error is dropped.
    // NOTE(review): confirm that silently ignoring it there is intended.
    if err := drv.programInput(s.vni, n.secure); err != nil && n.secure {
        return multierror.Append(err, drv.programMangle(s.vni, false))
    }

    if err := n.setupSubnetSandbox(s, bridge, vxlan); err != nil {
        return err
    }

    s.vxlanName, s.brName = vxlan, bridge
    return nil
}

// cleanupStaleSandboxes walks the directory in which osl.GenerateKey places
// sandbox keys and destroys every entry whose file name has, as its second
// "-"-separated component, a non-empty substring of this network's ID. For
// each such entry the vxlan devices are deleted, the path is unmounted and
// removed, and all vniTbl references to the path are dropped.
//
// The whole operation is best effort: entries that cannot be visited are
// skipped and unmount/remove failures are ignored.
func (n *network) cleanupStaleSandboxes() {
    // The callback never returns an error, so WalkDir's own return value
    // carries no information and is not checked.
    filepath.WalkDir(filepath.Dir(osl.GenerateKey("walk")),
        func(path string, _ os.DirEntry, walkErr error) error {
            if walkErr != nil {
                // The entry could not be stat'ed or read; skip it and keep
                // walking.
                return nil
            }

            _, fname := filepath.Split(path)

            pList := strings.Split(fname, "-")
            if len(pList) <= 1 {
                return nil
            }

            // An empty pattern is contained in every string, so it must not
            // be treated as a match: it would destroy sandboxes that have
            // nothing to do with this network.
            pattern := pList[1]
            if pattern == "" || !strings.Contains(n.id, pattern) {
                return nil
            }

            // Delete all vnis
            deleteVxlanByVNI(path, 0)
            // Best-effort teardown: the path may already be unmounted or
            // gone, so these errors are deliberately ignored.
            _ = unix.Unmount(path, unix.MNT_DETACH)
            _ = os.Remove(path)

            // Now that we have destroyed this
            // sandbox, remove all references to
            // it in vniTbl so that we don't
            // inadvertently destroy the sandbox
            // created in this life.
            networkMu.Lock()
            for vni, tblPath := range vniTbl {
                if tblPath == path {
                    delete(vniTbl, vni)
                }
            }
            networkMu.Unlock()

            return nil
        })
}

// initSandbox starts a new initialisation epoch, cleans up stale sandboxes
// belonging to this network, and creates a fresh sandbox whose key is built
// from the epoch number and the network ID. On success the sandbox is stored
// in n.sbox.
func (n *network) initSandbox() error {
    n.initEpoch++

    // Sandboxes of this network left over from a previous daemon life have
    // to go before a new one is created.
    n.cleanupStaleSandboxes()

    epochPrefix := fmt.Sprintf("%d-", n.initEpoch)
    created, err := osl.NewSandbox(osl.GenerateKey(epochPrefix+n.id), true, false)
    if err != nil {
        return fmt.Errorf("could not get network sandbox: %v", err)
    }

    // peerAdd needs the sandbox reference in order to configure it.
    n.sbox = created
    return nil
}

// network looks up the network registered under nid while holding the driver
// lock. It returns nil when there is none.
func (d *driver) network(nid string) *network {
    d.Lock()
    defer d.Unlock()

    return d.networks[nid]
}

// sandbox returns the network's current sandbox, read under the network
// lock. The result is nil when no sandbox is set.
func (n *network) sandbox() *osl.Namespace {
    n.Lock()
    current := n.sbox
    n.Unlock()

    return current
}

// getSubnetforIP returns the first subnet whose prefix length equals that of
// ip's mask and whose range contains ip.IP, or nil if no subnet qualifies.
func (n *network) getSubnetforIP(ip *net.IPNet) *subnet {
    for idx := range n.subnets {
        candidate := n.subnets[idx]

        // Only subnets with the same mask length as ip are considered.
        subnetOnes, _ := candidate.subnetIP.Mask.Size()
        ipOnes, _ := ip.Mask.Size()
        if subnetOnes == ipOnes && candidate.subnetIP.Contains(ip.IP) {
            return candidate
        }
    }
    return nil
}