manager/scheduler/scheduler.go from docker/swarmkit

manager/scheduler/scheduler.go
Summary

Maintainability

4 days
Test Coverage

Issues
package scheduler

import (
    "context"
    "sync"
    "time"

    "github.com/moby/swarmkit/v2/api"
    "github.com/moby/swarmkit/v2/api/genericresource"
    "github.com/moby/swarmkit/v2/log"
    "github.com/moby/swarmkit/v2/manager/state"
    "github.com/moby/swarmkit/v2/manager/state/store"
    "github.com/moby/swarmkit/v2/protobuf/ptypes"
)

// monitorFailures is how far back task failures are counted when deciding
// whether a node is faulty for a particular service.
const monitorFailures = 5 * time.Minute

// maxFailures is the failure count, within the monitorFailures window, at
// which a node starts being downweighted by the sorting function.
const maxFailures = 5

// schedulingDecision pairs a task as the scheduler last saw it with the
// updated copy the scheduler wants to commit to the store.
type schedulingDecision struct {
    // old is the task before the decision was made. It is what gets restored
    // into allTasks (and possibly re-enqueued) when the decision fails.
    old *api.Task
    // new is the modified copy of the task that is written to the store.
    new *api.Task
}

// Scheduler assigns tasks to nodes.
type Scheduler struct {
    // store is the memory store the scheduler watches for events and commits
    // its scheduling decisions to.
    store           *store.MemoryStore
    // unassignedTasks holds tasks without a NodeID that are waiting for the
    // next tick, keyed by task ID.
    unassignedTasks map[string]*api.Task
    // pendingPreassignedTasks already have NodeID, need resource validation
    pendingPreassignedTasks map[string]*api.Task
    // preassignedTasks tracks tasks that were preassigned, including those
    // past the pending state.
    preassignedTasks map[string]struct{}
    // nodeSet holds the scheduler's per-node bookkeeping (NodeInfo), looked
    // up by node ID.
    nodeSet          nodeSet
    // allTasks holds every task the scheduler currently tracks, keyed by
    // task ID.
    allTasks         map[string]*api.Task
    // pipeline is the filter pipeline used to decide whether a node can
    // accommodate a task.
    pipeline         *Pipeline
    // volumes tracks known volumes and the tasks' reservations on them.
    volumes          *volumeSet

    // stopOnce is a sync.Once used to ensure that Stop is idempotent
    stopOnce sync.Once
    // stopChan signals to the state machine to stop running
    stopChan chan struct{}
    // doneChan is closed when the state machine terminates
    doneChan chan struct{}
}

// New creates a new scheduler backed by the given store. All internal maps,
// channels, the filter pipeline and the volume set are initialized; the
// nodeSet is left at its zero value.
func New(store *store.MemoryStore) *Scheduler {
    sched := new(Scheduler)

    sched.store = store

    // task bookkeeping
    sched.unassignedTasks = make(map[string]*api.Task)
    sched.pendingPreassignedTasks = make(map[string]*api.Task)
    sched.preassignedTasks = make(map[string]struct{})
    sched.allTasks = make(map[string]*api.Task)

    // lifecycle signalling
    sched.stopChan = make(chan struct{})
    sched.doneChan = make(chan struct{})

    // placement helpers
    sched.pipeline = NewPipeline()
    sched.volumes = newVolumeSet()

    return sched
}

// setupTasksList seeds the scheduler's state from a read transaction.
//
// It registers every created volume, sorts the existing tasks into the
// unassigned queue, the pending-preassigned set, or the per-node task lists,
// and finally builds the node set from those per-node lists. Any error from
// the store lookups or from buildNodeSet is returned unchanged.
func (s *Scheduler) setupTasksList(tx store.ReadTx) error {
    existingVolumes, err := store.FindVolumes(tx, store.All)
    if err != nil {
        return err
    }

    // Only volumes that have been created (i.e. carry a VolumeID) are
    // usable for scheduling.
    for _, vol := range existingVolumes {
        if vol.VolumeInfo == nil || vol.VolumeInfo.VolumeID == "" {
            continue
        }
        s.volumes.addOrUpdateVolume(vol)
    }

    existingTasks, err := store.FindTasks(tx, store.All)
    if err != nil {
        return err
    }

    assignedByNode := make(map[string]map[string]*api.Task)
    for _, task := range existingTasks {
        st := task.Status.State

        switch {
        case st < api.TaskStatePending, st > api.TaskStateRunning:
            // Not yet PENDING, or no longer consuming resources.
            continue
        case st == api.TaskStatePending && task.DesiredState > api.TaskStateCompleted:
            // Never assigned, and already wanted beyond COMPLETED. This
            // can happen when a service is updated, deleted or scaled
            // down before its tasks were assigned.
            continue
        }

        s.allTasks[task.ID] = task

        switch {
        case task.NodeID == "":
            s.enqueue(task)
        case st == api.TaskStatePending:
            // Preassigned: resources still have to be validated against
            // the chosen node.
            s.preassignedTasks[task.ID] = struct{}{}
            s.pendingPreassignedTasks[task.ID] = task
        default:
            // Already placed on a node: account for its volumes and
            // remember it under that node.
            s.volumes.reserveTaskVolumes(task)

            nodeTasks := assignedByNode[task.NodeID]
            if nodeTasks == nil {
                nodeTasks = make(map[string]*api.Task)
                assignedByNode[task.NodeID] = nodeTasks
            }
            nodeTasks[task.ID] = task
        }
    }

    return s.buildNodeSet(tx, assignedByNode)
}

// Run is the scheduler event loop.
//
// It registers the volumes filter on the pipeline, takes a snapshot of the
// store through setupTasksList while subscribing to subsequent store events,
// processes the pending preassigned tasks and the initial unassigned queue,
// and then handles events until stopChan is closed. doneChan is closed when
// Run returns.
//
// Run returns the error from store.ViewAndWatch if the initial snapshot
// fails, and nil once stopChan has been closed.
//
// NOTE(review): the event loop does not select on the context's Done
// channel; only closing stopChan (see Stop) terminates it.
func (s *Scheduler) Run(pctx context.Context) error {
    ctx := log.WithModule(pctx, "scheduler")
    defer close(s.doneChan)

    s.pipeline.AddFilter(&VolumesFilter{vs: s.volumes})

    updates, cancel, err := store.ViewAndWatch(s.store, s.setupTasksList)
    if err != nil {
        log.G(ctx).WithError(err).Errorf("snapshot store update failed")
        return err
    }
    defer cancel()

    // Validate resource for tasks from preassigned tasks
    // do this before other tasks because preassigned tasks like
    // global service should start before other tasks
    s.processPreassignedTasks(ctx)

    // Queue all unassigned tasks before processing changes.
    s.tick(ctx)

    const (
        // commitDebounceGap is the amount of time to wait between
        // commit events to debounce them.
        commitDebounceGap = 50 * time.Millisecond
        // maxLatency is a time limit on the debouncing.
        maxLatency = time.Second
    )
    var (
        // debouncingStarted is when the current debounce window was opened.
        debouncingStarted     time.Time
        // commitDebounceTimer is non-nil only while a debounce window is open.
        commitDebounceTimer   *time.Timer
        // commitDebounceTimeout is the timer's channel, or nil when no window
        // is open (a nil channel never becomes ready in the select below).
        commitDebounceTimeout <-chan time.Time
    )

    // tickRequired records that some event since the last scheduling pass
    // may have changed what can be scheduled.
    tickRequired := false

    // schedule runs one scheduling pass: pending preassigned tasks first,
    // then the unassigned queue if any event asked for a tick.
    schedule := func() {
        if len(s.pendingPreassignedTasks) > 0 {
            s.processPreassignedTasks(ctx)
        }
        if tickRequired {
            s.tick(ctx)
            tickRequired = false
        }
    }

    // Watch for changes.
    for {
        select {
        case event := <-updates:
            switch v := event.(type) {
            case api.EventCreateTask:
                if s.createTask(ctx, v.Task) {
                    tickRequired = true
                }
            case api.EventUpdateTask:
                if s.updateTask(ctx, v.Task) {
                    tickRequired = true
                }
            case api.EventDeleteTask:
                if s.deleteTask(v.Task) {
                    // deleting tasks may free up node resource, pending tasks should be re-evaluated.
                    tickRequired = true
                }
            case api.EventCreateNode:
                s.createOrUpdateNode(v.Node)
                tickRequired = true
            case api.EventUpdateNode:
                s.createOrUpdateNode(v.Node)
                tickRequired = true
            case api.EventDeleteNode:
                // removing a node does not by itself request a tick
                s.nodeSet.remove(v.Node.ID)
            case api.EventUpdateVolume:
                // there is no need for a EventCreateVolume case, because
                // volumes are not ready to use until they've passed through
                // the volume manager and been created with the plugin
                //
                // as such, only addOrUpdateVolume if the VolumeInfo exists and
                // has a nonempty VolumeID
                if v.Volume.VolumeInfo != nil && v.Volume.VolumeInfo.VolumeID != "" {
                    // TODO(dperny): verify that updating volumes doesn't break
                    // scheduling
                    log.G(ctx).WithField("volume.id", v.Volume.ID).Debug("updated volume")
                    s.volumes.addOrUpdateVolume(v.Volume)
                    tickRequired = true
                }
            case state.EventCommit:
                // Scheduling only happens on commit boundaries, debounced:
                // each commit pushes the timer back by commitDebounceGap,
                // but once the window has been open longer than maxLatency
                // the pass runs immediately.
                if commitDebounceTimer != nil {
                    if time.Since(debouncingStarted) > maxLatency {
                        commitDebounceTimer.Stop()
                        commitDebounceTimer = nil
                        commitDebounceTimeout = nil
                        schedule()
                    } else {
                        commitDebounceTimer.Reset(commitDebounceGap)
                    }
                } else {
                    // first commit of a new window: start the timer
                    commitDebounceTimer = time.NewTimer(commitDebounceGap)
                    commitDebounceTimeout = commitDebounceTimer.C
                    debouncingStarted = time.Now()
                }
            }
        case <-commitDebounceTimeout:
            // the debounce gap elapsed without another commit: run the pass
            // and close the window
            schedule()
            commitDebounceTimer = nil
            commitDebounceTimeout = nil
        case <-s.stopChan:
            return nil
        }
    }
}

// Stop causes the scheduler event loop to stop running and then blocks until
// doneChan has been closed. It may be called more than once.
func (s *Scheduler) Stop() {
    // Closing stopChan twice would panic, so the close is guarded by
    // stopOnce. This helps in some test cases.
    signalStop := func() {
        close(s.stopChan)
    }
    s.stopOnce.Do(signalStop)

    <-s.doneChan
}

// enqueue queues a task for scheduling by recording it in unassignedTasks
// under its ID, replacing any entry already stored for that ID.
func (s *Scheduler) enqueue(t *api.Task) {
    taskID := t.ID
    s.unassignedTasks[taskID] = t
}

// createTask handles a task-created event. It reports whether the event
// requires a scheduling tick, which is only the case when the new task is
// unassigned.
func (s *Scheduler) createTask(ctx context.Context, t *api.Task) bool {
    st := t.Status.State

    // Tasks that have not reached PENDING, and tasks that no longer consume
    // resources, are of no interest.
    if st < api.TaskStatePending || st > api.TaskStateRunning {
        return false
    }

    s.allTasks[t.ID] = t

    switch {
    case t.NodeID == "":
        // unassigned task
        s.enqueue(t)
        return true
    case st == api.TaskStatePending:
        // Preassigned tasks do not contribute to the running tasks count;
        // they wait for resource validation instead.
        s.preassignedTasks[t.ID] = struct{}{}
        s.pendingPreassignedTasks[t.ID] = t
        return false
    }

    // The task is already placed on a node: account for it there.
    if info, err := s.nodeSet.nodeInfo(t.NodeID); err == nil && info.addTask(t) {
        s.nodeSet.updateNode(info)
    }
    return false
}

// updateTask handles a task-updated event and reports whether a scheduling
// tick is required.
//
// A tick is requested when a tracked task moves past RUNNING (its resources
// are released) or when the updated task is unassigned. Updates for
// preassigned pending tasks and for tasks already on a node only refresh the
// scheduler's bookkeeping.
func (s *Scheduler) updateTask(ctx context.Context, t *api.Task) bool {
    st := t.Status.State

    // Tasks that have not reached PENDING are ignored.
    if st < api.TaskStatePending {
        return false
    }

    prev := s.allTasks[t.ID]

    // The task no longer consumes resources.
    if st > api.TaskStateRunning {
        if prev == nil {
            return false
        }

        failedOrRejected := st == api.TaskStateFailed || st == api.TaskStateRejected
        if failedOrRejected && st != prev.Status.State {
            // Keep track of task failures, so other nodes can be preferred
            // for scheduling this service if it looks like the service is
            // failing in a loop on this node. Preassigned tasks are
            // skipped, because the scheduler does not choose which nodes
            // those run on.
            if _, preassigned := s.preassignedTasks[t.ID]; !preassigned {
                if info, err := s.nodeSet.nodeInfo(t.NodeID); err == nil {
                    info.taskFailed(ctx, t)
                    s.nodeSet.updateNode(info)
                }
            }
        }

        s.deleteTask(prev)
        return true
    }

    // forgetPrev drops the previously tracked version of the task, if any.
    forgetPrev := func() {
        if prev != nil {
            s.deleteTask(prev)
        }
    }

    switch {
    case t.NodeID == "":
        // unassigned task
        forgetPrev()
        s.allTasks[t.ID] = t
        s.enqueue(t)
        return true
    case st == api.TaskStatePending:
        // Preassigned tasks do not contribute to the running tasks count.
        forgetPrev()
        s.preassignedTasks[t.ID] = struct{}{}
        s.allTasks[t.ID] = t
        s.pendingPreassignedTasks[t.ID] = t
        return false
    default:
        s.allTasks[t.ID] = t
        if info, err := s.nodeSet.nodeInfo(t.NodeID); err == nil && info.addTask(t) {
            s.nodeSet.updateNode(info)
        }
        return false
    }
}

// deleteTask forgets a task: it is removed from every task map, its volume
// reservations are released, and it is removed from its node's bookkeeping.
// It returns true only when the node is known and removeTask reports true
// for the task.
func (s *Scheduler) deleteTask(t *api.Task) bool {
    taskID := t.ID

    delete(s.allTasks, taskID)
    delete(s.preassignedTasks, taskID)
    delete(s.pendingPreassignedTasks, taskID)

    // remove the task volume reservations as well, if any
    for _, va := range t.Volumes {
        s.volumes.releaseVolume(va.ID, taskID)
    }

    info, err := s.nodeSet.nodeInfo(t.NodeID)
    if err != nil || !info.removeTask(t) {
        return false
    }
    s.nodeSet.updateNode(info)
    return true
}

// createOrUpdateNode records a new or changed node in the node set.
//
// The node's available resources are recomputed from its description. When
// the node is already known, the reservations of the tasks tracked on it are
// subtracted. A node without a resource description gets empty resources.
func (s *Scheduler) createOrUpdateNode(n *api.Node) {
    info, lookupErr := s.nodeSet.nodeInfo(n.ID)
    known := lookupErr == nil

    var available *api.Resources
    if n.Description == nil || n.Description.Resources == nil {
        available = &api.Resources{}
    } else {
        available = n.Description.Resources.Copy()
        if known {
            // reconcile resources by looping over all tasks in this node
            for _, task := range info.Tasks {
                reserved := taskReservations(task.Spec)

                available.MemoryBytes -= reserved.MemoryBytes
                available.NanoCPUs -= reserved.NanoCPUs

                genericresource.ConsumeNodeResources(
                    &available.Generic,
                    task.AssignedGenericResources,
                )
            }
        }
    }

    if known {
        info.Node = n
        info.AvailableResources = available
    } else {
        info = newNodeInfo(n, nil, *available)
    }
    s.nodeSet.addOrUpdateNode(info)
}

// processPreassignedTasks checks every pending preassigned task against its
// designated node and commits the outcome to the store.
//
// Tasks that were committed in the ASSIGNED state leave the pending set. For
// decisions that could not be committed, the in-memory changes made by
// taskFitNode are rolled back: the old task is restored in allTasks, the new
// task is removed from the node, and its volumes are released.
func (s *Scheduler) processPreassignedTasks(ctx context.Context) {
    decisions := make(map[string]schedulingDecision, len(s.pendingPreassignedTasks))
    for _, pending := range s.pendingPreassignedTasks {
        // a nil result means the node is not in the node set; skip the task
        if fitted := s.taskFitNode(ctx, pending, pending.NodeID); fitted != nil {
            decisions[pending.ID] = schedulingDecision{old: pending, new: fitted}
        }
    }

    committed, rejected := s.applySchedulingDecisions(ctx, decisions)

    for _, d := range committed {
        if d.new.Status.State != api.TaskStateAssigned {
            // the task did not fit; it stays pending for another attempt
            continue
        }
        delete(s.pendingPreassignedTasks, d.old.ID)
    }

    for _, d := range rejected {
        s.allTasks[d.old.ID] = d.old

        if info, err := s.nodeSet.nodeInfo(d.new.NodeID); err == nil && info.removeTask(d.new) {
            s.nodeSet.updateNode(info)
        }

        for _, va := range d.new.Volumes {
            s.volumes.releaseVolume(va.ID, d.new.ID)
        }
    }
}

// tick attempts to schedule the queue.
//
// The unassigned queue is drained. Tasks that carry a spec version are
// grouped by service and spec version so each group is scheduled together,
// and tasks without one are scheduled one by one. Decisions that fail to
// commit are rolled back in memory and their tasks are put back on the queue.
func (s *Scheduler) tick(ctx context.Context) {
    type specGroupKey struct {
        serviceID   string
        specVersion api.Version
    }

    var (
        groups    = make(map[specGroupKey]map[string]*api.Task)
        oneOffs   []*api.Task
        decisions = make(map[string]schedulingDecision, len(s.unassignedTasks))
    )

    for id, task := range s.unassignedTasks {
        // every entry leaves the queue, whatever happens to it below
        delete(s.unassignedTasks, id)

        switch {
        case task == nil || task.NodeID != "":
            // task deleted or already assigned
            continue
        case task.SpecVersion == nil:
            // This task doesn't have a spec version. We have to
            // schedule it as a one-off.
            oneOffs = append(oneOffs, task)
        default:
            // Group tasks with common specs
            key := specGroupKey{
                serviceID:   task.ServiceID,
                specVersion: *task.SpecVersion,
            }
            group := groups[key]
            if group == nil {
                group = make(map[string]*api.Task)
                groups[key] = group
            }
            group[id] = task
        }
    }

    for _, group := range groups {
        s.scheduleTaskGroup(ctx, group, decisions)
    }
    for _, task := range oneOffs {
        single := map[string]*api.Task{task.ID: task}
        s.scheduleTaskGroup(ctx, single, decisions)
    }

    _, rejected := s.applySchedulingDecisions(ctx, decisions)
    for _, d := range rejected {
        s.allTasks[d.old.ID] = d.old

        if info, err := s.nodeSet.nodeInfo(d.new.NodeID); err == nil && info.removeTask(d.new) {
            s.nodeSet.updateNode(info)
        }

        // release the volumes we tried to use
        for _, va := range d.new.Volumes {
            s.volumes.releaseVolume(va.ID, d.new.ID)
        }

        // enqueue task for next scheduling attempt
        s.enqueue(d.old)
    }
}

// applySchedulingDecisions writes the given scheduling decisions to the store
// and reports which ones were committed (successful) and which could not be
// committed (failed).
//
// Entries are deleted from schedulingDecisions as they are processed.
// Decisions whose task no longer exists in the store, or whose stored status
// already matches the new status, end up in neither result slice.
//
// If the batch itself returns an error, every decision that had been counted
// as successful is moved to failed and successful is returned as nil.
func (s *Scheduler) applySchedulingDecisions(ctx context.Context, schedulingDecisions map[string]schedulingDecision) (successful, failed []schedulingDecision) {
    // applySchedulingDecisions is the only place where we make store
    // transactions in the scheduler. the scheduler is responsible for freeing
    // volumes that are no longer in use. this means that volumes should be
    // freed in this function. sometimes, there are no scheduling decisions to
    // be made, so we return early in the if statement below.
    //
    // however, in all cases, any activity that results in a tick could result
    // in needing volumes to be freed, even if nothing new is scheduled. this
    // freeing of volumes should always happen *after* all of the scheduling
    // decisions have been committed, hence the defer.
    //
    // NOTE(review): the value returned by this deferred Batch call is
    // discarded.
    defer s.store.Batch(s.volumes.freeVolumes)

    if len(schedulingDecisions) == 0 {
        return
    }

    successful = make([]schedulingDecision, 0, len(schedulingDecisions))

    // Apply changes to master store
    err := s.store.Batch(func(batch *store.Batch) error {
        // keep issuing updates until every decision has been consumed
        for len(schedulingDecisions) > 0 {
            err := batch.Update(func(tx store.Tx) error {
                // Update exactly one task inside this Update
                // callback.
            taskLoop:
                for taskID, decision := range schedulingDecisions {
                    // consume the decision up front, so it is never retried
                    // within this call regardless of the outcome below
                    delete(schedulingDecisions, taskID)

                    t := store.GetTask(tx, taskID)
                    if t == nil {
                        // Task no longer exists
                        s.deleteTask(decision.new)
                        continue
                    }

                    if t.Status.State == decision.new.Status.State &&
                        t.Status.Message == decision.new.Status.Message &&
                        t.Status.Err == decision.new.Status.Err {
                        // No changes, ignore
                        continue
                    }

                    // the stored task is already at ASSIGNED or beyond: only
                    // continue if the node is still known to the scheduler
                    // and its stored version matches the version the
                    // scheduler holds
                    if t.Status.State >= api.TaskStateAssigned {
                        nodeInfo, err := s.nodeSet.nodeInfo(decision.new.NodeID)
                        if err != nil {
                            failed = append(failed, decision)
                            continue
                        }
                        node := store.GetNode(tx, decision.new.NodeID)
                        if node == nil || node.Meta.Version != nodeInfo.Meta.Version {
                            // node is out of date
                            failed = append(failed, decision)
                            continue
                        }
                    }

                    // volumes collects the volumes that gain a new publish
                    // status entry and therefore need to be written back
                    volumes := []*api.Volume{}
                    for _, va := range decision.new.Volumes {
                        v := store.GetVolume(tx, va.ID)
                        if v == nil {
                            log.G(ctx).Debugf(
                                "scheduler failed to update task %s because volume %s could not be found",
                                taskID,
                                va.ID,
                            )
                            failed = append(failed, decision)
                            continue taskLoop
                        }

                        // it's ok if the copy of the Volume we scheduled off
                        // of is out of date, because the Scheduler is the only
                        // component which add new uses of a particular Volume,
                        // which means that in most cases, no update to the
                        // volume could conflict with the copy the Scheduler
                        // used to make decisions.
                        //
                        // the exception is that the VolumeAvailability could
                        // have been changed. both Pause and Drain
                        // availabilities mean the Volume should not be
                        // scheduled, and so we call off our attempt to commit
                        // this scheduling decision. this is the only field we
                        // must check for conflicts.
                        //
                        // this is, additionally, the reason that a Volume must
                        // be set to Drain before it can be deleted. it stops
                        // us from having to worry about any other field when
                        // attempting to use the Volume.
                        if v.Spec.Availability != api.VolumeAvailabilityActive {
                            log.G(ctx).Debugf(
                                "scheduler failed to update task %s because volume %s has availability %s",
                                taskID, v.ID, v.Spec.Availability.String(),
                            )
                            failed = append(failed, decision)
                            continue taskLoop
                        }

                        // add a PENDING_PUBLISH entry for the target node
                        // unless the volume already has an entry for it
                        alreadyPublished := false
                        for _, ps := range v.PublishStatus {
                            if ps.NodeID == decision.new.NodeID {
                                alreadyPublished = true
                                break
                            }
                        }
                        if !alreadyPublished {
                            v.PublishStatus = append(
                                v.PublishStatus,
                                &api.VolumePublishStatus{
                                    NodeID: decision.new.NodeID,
                                    State:  api.VolumePublishStatus_PENDING_PUBLISH,
                                },
                            )
                            volumes = append(volumes, v)
                        }
                    }

                    if err := store.UpdateTask(tx, decision.new); err != nil {
                        log.G(ctx).Debugf("scheduler failed to update task %s; will retry", taskID)
                        failed = append(failed, decision)
                        continue
                    }
                    for _, v := range volumes {
                        if err := store.UpdateVolume(tx, v); err != nil {
                            // TODO(dperny): handle the case of a partial
                            // update?
                            log.G(ctx).WithError(err).Debugf(
                                "scheduler failed to update task %v; volume %v could not be updated",
                                taskID, v.ID,
                            )
                            failed = append(failed, decision)
                            continue taskLoop
                        }
                    }
                    successful = append(successful, decision)
                    // one task was written; end this Update callback
                    return nil
                }
                // every remaining decision was skipped or failed
                return nil
            })
            if err != nil {
                return err
            }
        }
        // nothing more to do here: releasing volumes is handled by the
        // deferred freeVolumes batch registered at the top of this function.
        return nil
    })

    if err != nil {
        log.G(ctx).WithError(err).Error("scheduler tick transaction failed")
        // the batch failed, so none of the decisions can be treated as
        // committed
        failed = append(failed, successful...)
        successful = nil
    }
    return
}

// taskFitNode checks if a node has enough resources to accommodate a task.
//
// It returns nil when the node is not in the node set. Otherwise it returns
// a modified copy of the task, which is also stored in allTasks. If the task
// does not fit, or no volumes could be chosen, the copy keeps its state and
// carries the reason in Status.Err. If it fits, the copy is ASSIGNED with its
// volume attachments set, and it is added to the node's bookkeeping.
func (s *Scheduler) taskFitNode(ctx context.Context, t *api.Task, nodeID string) *api.Task {
    info, err := s.nodeSet.nodeInfo(nodeID)
    if err != nil {
        // node does not exist in set (it may have been deleted)
        return nil
    }

    candidate := *t

    // track records the candidate as the current version of the task and
    // hands it back to the caller.
    track := func() *api.Task {
        s.allTasks[t.ID] = &candidate
        return &candidate
    }

    s.pipeline.SetTask(t)
    if fits := s.pipeline.Process(&info); !fits {
        // this node cannot accommodate this task
        candidate.Status.Timestamp = ptypes.MustTimestampProto(time.Now())
        candidate.Status.Err = s.pipeline.Explain()
        return track()
    }

    // Choosing volumes should always succeed, because nodes are expected to
    // have been filtered on volume availability already; the error case is
    // kept in case something was missed.
    chosen, err := s.volumes.chooseTaskVolumes(t, &info)
    if err != nil {
        candidate.Status.Timestamp = ptypes.MustTimestampProto(time.Now())
        candidate.Status.Err = err.Error()
        return track()
    }

    candidate.Volumes = chosen
    candidate.Status = api.TaskStatus{
        State:     api.TaskStateAssigned,
        Timestamp: ptypes.MustTimestampProto(time.Now()),
        Message:   "scheduler confirmed task can run on preassigned node",
    }
    s.allTasks[t.ID] = &candidate

    if info.addTask(&candidate) {
        s.nodeSet.updateNode(info)
    }
    return &candidate
}

// scheduleTaskGroup schedules a batch of tasks that are part of the same
// service and share the same version of the spec.
//
//   - taskGroup maps task ID to task. It is handed to
//     scheduleNTasksOnSubtree; whatever is still in the map afterwards is
//     passed to noSuitableNode.
//   - schedulingDecisions collects the decisions made for the group.
//
// An empty (or nil) taskGroup is a no-op: there is no task to take the
// service ID, constraints and placement preferences from.
func (s *Scheduler) scheduleTaskGroup(ctx context.Context, taskGroup map[string]*api.Task, schedulingDecisions map[string]schedulingDecision) {
    // Without this guard the representative task below would stay nil and
    // be dereferenced.
    if len(taskGroup) == 0 {
        return
    }

    // Pick a task at random from taskGroup to use for constraint
    // evaluation. It doesn't matter which one we pick because all the
    // tasks in the group are equal in terms of the fields the constraint
    // filters consider.
    var t *api.Task
    for _, t = range taskGroup {
        break
    }

    s.pipeline.SetTask(t)

    now := time.Now()

    // nodeLess reports whether node a is preferable to node b for this
    // service.
    nodeLess := func(a *NodeInfo, b *NodeInfo) bool {
        // If either node has at least maxFailures recent failures,
        // that's the deciding factor.
        recentFailuresA := a.countRecentFailures(now, t)
        recentFailuresB := b.countRecentFailures(now, t)

        if recentFailuresA >= maxFailures || recentFailuresB >= maxFailures {
            if recentFailuresA > recentFailuresB {
                return false
            }
            if recentFailuresB > recentFailuresA {
                return true
            }
        }

        // Next, prefer the node running fewer tasks of this service.
        tasksByServiceA := a.ActiveTasksCountByService[t.ServiceID]
        tasksByServiceB := b.ActiveTasksCountByService[t.ServiceID]

        if tasksByServiceA < tasksByServiceB {
            return true
        }
        if tasksByServiceA > tasksByServiceB {
            return false
        }

        // Total number of tasks breaks ties.
        return a.ActiveTasksCount < b.ActiveTasksCount
    }

    var prefs []*api.PlacementPreference
    if t.Spec.Placement != nil {
        prefs = t.Spec.Placement.Preferences
    }

    tree := s.nodeSet.tree(t.ServiceID, prefs, len(taskGroup), s.pipeline.Process, nodeLess)

    s.scheduleNTasksOnSubtree(ctx, len(taskGroup), taskGroup, &tree, schedulingDecisions, nodeLess)
    if len(taskGroup) != 0 {
        // some tasks could not be placed anywhere
        s.noSuitableNode(ctx, taskGroup, schedulingDecisions)
    }
}

// scheduleNTasksOnSubtree schedules a set of tasks with identical constraints
// onto a set of nodes, taking into account placement preferences.
//
// placement preferences are used to create a tree such that every branch
// represents one subset of nodes across which tasks should be spread.
//
// because of this tree structure, scheduleNTasksOnSubtree is a recursive
// function. If there are subtrees of the current tree, then we recurse. if we
// are at a leaf node, past which there are no subtrees, then we try to
// schedule a proportional number of tasks to the nodes of that branch.
//
//   - n is the number of tasks being scheduled on this subtree
//   - taskGroup is a set of tasks to schedule, taking the form of a map from the
//     task ID to the task object.
//   - tree is the decision tree we're scheduling on. this is, effectively, the
//     set of nodes that meet scheduling constraints. these nodes are arranged
//     into a tree so that placement preferences can be taken into account when
//     spreading tasks across nodes.
//   - schedulingDecisions is a set of the scheduling decisions already made for
//     this tree
//   - nodeLess is a comparator that chooses which of the two nodes is preferable
//     to schedule on.
//
// returns the number of tasks that were scheduled on this subtree, which may
// be fewer than n.
func (s *Scheduler) scheduleNTasksOnSubtree(ctx context.Context, n int, taskGroup map[string]*api.Task, tree *decisionTree, schedulingDecisions map[string]schedulingDecision, nodeLess func(a *NodeInfo, b *NodeInfo) bool) int {
    // leaf: no further branches, so schedule directly onto this branch's
    // nodes (nothing is scheduled if the branch has no usable nodes)
    if tree.next == nil {
        nodes := tree.orderedNodes(s.pipeline.Process, nodeLess)
        if len(nodes) == 0 {
            return 0
        }

        return s.scheduleNTasksOnNodes(ctx, n, taskGroup, nodes, schedulingDecisions, nodeLess)
    }

    // Walk the tree and figure out how the tasks should be split at each
    // level.
    tasksScheduled := 0
    // presumably tree.tasks is the number of tasks already counted against
    // this tree — TODO confirm against decisionTree
    tasksInUsableBranches := tree.tasks
    // noRoom is the set of branches that scheduled fewer tasks than they
    // were asked to; it is allocated lazily
    var noRoom map[*decisionTree]struct{}

    // Try to make branches even until either all branches are
    // full, or all tasks have been scheduled.
    for tasksScheduled != n && len(noRoom) != len(tree.next) {
        // split the tasks in usable branches plus the tasks still to be
        // scheduled evenly across the branches that still have room; the
        // remainder is handed out one task per branch
        desiredTasksPerBranch := (tasksInUsableBranches + n - tasksScheduled) / (len(tree.next) - len(noRoom))
        remainder := (tasksInUsableBranches + n - tasksScheduled) % (len(tree.next) - len(noRoom))

        for _, subtree := range tree.next {
            if noRoom != nil {
                if _, ok := noRoom[subtree]; ok {
                    continue
                }
            }
            subtreeTasks := subtree.tasks
            // only branches below the desired level, or exactly at it while
            // a remainder is still left to hand out, receive tasks
            if subtreeTasks < desiredTasksPerBranch || (subtreeTasks == desiredTasksPerBranch && remainder > 0) {
                tasksToAssign := desiredTasksPerBranch - subtreeTasks
                if remainder > 0 {
                    tasksToAssign++
                }
                res := s.scheduleNTasksOnSubtree(ctx, tasksToAssign, taskGroup, subtree, schedulingDecisions, nodeLess)
                if res < tasksToAssign {
                    // the branch could not take everything it was given:
                    // exclude it, and its tasks, from later rounds
                    if noRoom == nil {
                        noRoom = make(map[*decisionTree]struct{})
                    }
                    noRoom[subtree] = struct{}{}
                    tasksInUsableBranches -= subtreeTasks
                } else if remainder > 0 {
                    // this branch absorbed one of the remainder tasks
                    remainder--
                }
                tasksScheduled += res
            }
        }
    }

    return tasksScheduled
}

// scheduleNTasksOnNodes schedules some number of tasks on the set of provided
// nodes. The number of tasks being scheduled may be less than the total number
// of tasks, as the nodes may be one branch of a tree used to spread tasks.
//
// It returns the number of tasks actually scheduled to these nodes. This may
// be fewer than the number of tasks desired to be scheduled, if there are
// insufficient nodes to meet resource constraints. It returns 0 without
// touching any state when n is not positive or when nodes is empty.
//
//   - n is the number of tasks desired to be scheduled to this set of nodes
//   - taskGroup is the tasks desired to be scheduled, in the form of a map from
//     task ID to task object. this argument is mutated; tasks which have been
//     scheduled are removed from the map.
//   - nodes is the set of nodes to schedule to. this argument is mutated;
//     entries are refreshed from the node set after a task is added to them.
//   - schedulingDecisions is the set of scheduling decisions that have been made
//     thus far, in the form of a map from task ID to the decision made. this
//     argument is mutated; a decision is recorded for every task scheduled here.
//   - nodeLess is a simple comparator that chooses which of two nodes would be
//     preferable to schedule on.
func (s *Scheduler) scheduleNTasksOnNodes(ctx context.Context, n int, taskGroup map[string]*api.Task, nodes []NodeInfo, schedulingDecisions map[string]schedulingDecision, nodeLess func(a *NodeInfo, b *NodeInfo) bool) int {
    nodeCount := len(nodes)
    // Bail out on degenerate input. Every node lookup below indexes with
    // nodeIter%nodeCount, which panics with a division by zero when there
    // are no nodes. Likewise, the "tasksScheduled == n" stop condition is
    // only checked after a task has been counted, so a non-positive n would
    // never match and more tasks than requested would be scheduled.
    if n <= 0 || nodeCount == 0 {
        return 0
    }

    tasksScheduled := 0
    failedConstraints := make(map[int]bool) // key is index in nodes slice
    nodeIter := 0
    // Deleting from taskGroup while ranging over it is safe in Go; removed
    // entries are simply not visited.
    for taskID, t := range taskGroup {
        // Skip tasks which were already scheduled because they ended
        // up in two groups at once.
        if _, exists := schedulingDecisions[taskID]; exists {
            continue
        }

        // idx is the slot in nodes that this task is assigned to. nodeIter
        // grows without bound and wraps around the slice via the modulo.
        idx := nodeIter % nodeCount
        node := &nodes[idx]
        // before doing all of the updating logic, get the volume attachments
        // for the task on this node. this should always succeed, because we
        // should already have filtered nodes based on volume availability, but
        // just in case we missed something and it doesn't, we have an error
        // case.
        attachments, err := s.volumes.chooseTaskVolumes(t, node)
        if err != nil {
            // TODO(dperny) if there's an error, then what? i'm frankly not
            // sure. For now the error is only logged and the task is still
            // assigned, with whatever attachments were returned.
            log.G(ctx).WithField("task.id", t.ID).WithError(err).Error("could not find task volumes")
        }

        log.G(ctx).WithField("task.id", t.ID).Debugf("assigning to node %s", node.ID)
        // Work on a shallow copy so that the original task, which is kept as
        // the "old" side of the scheduling decision, is left unmodified.
        newT := *t
        newT.Volumes = attachments
        newT.NodeID = node.ID
        s.volumes.reserveTaskVolumes(&newT)
        newT.Status = api.TaskStatus{
            State:     api.TaskStateAssigned,
            Timestamp: ptypes.MustTimestampProto(time.Now()),
            Message:   "scheduler assigned task to node",
        }
        s.allTasks[t.ID] = &newT

        // in each iteration of this loop, the node we choose will always be
        // one which meets constraints. at the end of each iteration, we
        // re-process nodes, allowing us to remove nodes which no longer meet
        // resource constraints.
        //
        // NOTE(review): when the lookup fails, or addTask reports false,
        // nodes[idx] is left as it was and the nodeLess comparison below
        // still uses the nodeInfo value returned here -- confirm that this
        // is intended.
        nodeInfo, err := s.nodeSet.nodeInfo(node.ID)
        if err == nil && nodeInfo.addTask(&newT) {
            s.nodeSet.updateNode(nodeInfo)
            nodes[idx] = nodeInfo
        }

        schedulingDecisions[taskID] = schedulingDecision{old: t, new: &newT}
        delete(taskGroup, taskID)
        tasksScheduled++
        if tasksScheduled == n {
            return tasksScheduled
        }

        if nodeIter+1 < nodeCount {
            // First pass fills the nodes until they have the same
            // number of tasks from this service. We only move on to the
            // next node once it compares as preferable to the one that
            // was just used.
            nextNode := nodes[(nodeIter+1)%nodeCount]
            if nodeLess(&nextNode, &nodeInfo) {
                nodeIter++
            }
        } else {
            // In later passes, we just assign one task at a time
            // to each node that still meets the constraints.
            nodeIter++
        }

        // Advance past nodes that fail the pipeline. A node that has failed
        // once is remembered in failedConstraints and is not re-evaluated
        // for the rest of this call.
        origNodeIter := nodeIter
        for failedConstraints[nodeIter%nodeCount] || !s.pipeline.Process(&nodes[nodeIter%nodeCount]) {
            failedConstraints[nodeIter%nodeCount] = true
            nodeIter++
            if nodeIter-origNodeIter == nodeCount {
                // None of the nodes meet the constraints anymore.
                return tasksScheduled
            }
        }
    }

    return tasksScheduled
}

// noSuitableNode handles the tasks in taskGroup for which no node could be
// found. Tasks whose service is no longer in the store are skipped. Every
// other task gets an updated copy that is stored in allTasks and recorded in
// schedulingDecisions:
//
//   - if the task's spec version is older than its service's, the copy is not
//     re-enqueued; when the task is still pending and desired to be shut down,
//     the copy is moved to the shutdown state.
//   - otherwise the copy carries a "no suitable node" error, including the
//     pipeline's explanation when there is one, and is re-enqueued so that it
//     is attempted again.
func (s *Scheduler) noSuitableNode(ctx context.Context, taskGroup map[string]*api.Task, schedulingDecisions map[string]schedulingDecision) {
    // The error text is the same for every task in the group, so build it
    // once up front.
    noNodeErr := "no suitable node"
    if reason := s.pipeline.Explain(); reason != "" {
        noNodeErr = "no suitable node (" + reason + ")"
    }

    for _, task := range taskGroup {
        logger := log.G(ctx).WithField("task.id", task.ID)

        var svc *api.Service
        s.store.View(func(tx store.ReadTx) {
            svc = store.GetService(tx, task.ServiceID)
        })
        if svc == nil {
            logger.Debug("removing task from the scheduler")
            continue
        }

        logger.Debug("no suitable node available for task")

        // Modify a copy; the original is kept as the "old" side of the
        // scheduling decision.
        updated := *task
        updated.Status.Timestamp = ptypes.MustTimestampProto(time.Now())

        svcVersion, taskVersion := svc.SpecVersion, updated.SpecVersion
        outdated := svcVersion != nil && taskVersion != nil && svcVersion.Index > taskVersion.Index
        if !outdated {
            updated.Status.Err = noNodeErr

            // re-enqueue a task that should still be attempted
            s.enqueue(&updated)
        } else {
            logger.Debug(
                "task belongs to old revision of service",
            )
            if task.Status.State == api.TaskStatePending && task.DesiredState >= api.TaskStateShutdown {
                logger.Debug(
                    "task is desired shutdown, scheduler will go ahead and do so",
                )
                updated.Status.State = api.TaskStateShutdown
                updated.Status.Err = ""
            }
        }

        s.allTasks[task.ID] = &updated
        schedulingDecisions[task.ID] = schedulingDecision{old: task, new: &updated}
    }
}

// buildNodeSet populates the scheduler's node set from every node found in
// the store through tx. tasksByNode maps a node ID to the tasks (keyed by task
// ID) for that node; the entry for each node is handed to newNodeInfo along
// with the resources from the node's description. A node without a
// description, or whose description carries no resources, is registered with
// a zero-valued api.Resources.
//
// The only error returned is the one reported by the store lookup.
func (s *Scheduler) buildNodeSet(tx store.ReadTx, tasksByNode map[string]map[string]*api.Task) error {
    allNodes, findErr := store.FindNodes(tx, store.All)
    if findErr != nil {
        return findErr
    }

    // Size the node set for the number of nodes about to be added.
    s.nodeSet.alloc(len(allNodes))

    for i := range allNodes {
        node := allNodes[i]

        var available api.Resources
        if desc := node.Description; desc != nil && desc.Resources != nil {
            available = *desc.Resources
        }

        info := newNodeInfo(node, tasksByNode[node.ID], available)
        s.nodeSet.addOrUpdateNode(info)
    }

    return nil
}