github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/scheduler/scheduler.go

     1  package scheduler
     2  
     3  import (
     4  	"context"
     5  	"time"
     6  
     7  	"github.com/docker/swarmkit/api"
     8  	"github.com/docker/swarmkit/api/genericresource"
     9  	"github.com/docker/swarmkit/log"
    10  	"github.com/docker/swarmkit/manager/state"
    11  	"github.com/docker/swarmkit/manager/state/store"
    12  	"github.com/docker/swarmkit/protobuf/ptypes"
    13  )
    14  
    15  const (
    16  	// monitorFailures is the lookback period for counting failures of
    17  	// a task to determine if a node is faulty for a particular service.
    18  	monitorFailures = 5 * time.Minute
    19  
    20  	// maxFailures is the number of failures within monitorFailures that
    21  	// triggers downweighting of a node in the sorting function.
    22  	maxFailures = 5
    23  )
    24  
    25  type schedulingDecision struct {
    26  	old *api.Task
    27  	new *api.Task
    28  }
    29  
    30  // Scheduler assigns tasks to nodes.
    31  type Scheduler struct {
    32  	store           *store.MemoryStore
    33  	unassignedTasks map[string]*api.Task
    34  	// pendingPreassignedTasks already have NodeID, need resource validation
    35  	pendingPreassignedTasks map[string]*api.Task
    36  	// preassignedTasks tracks tasks that were preassigned, including those
    37  	// past the pending state.
    38  	preassignedTasks map[string]struct{}
    39  	nodeSet          nodeSet
    40  	allTasks         map[string]*api.Task
    41  	pipeline         *Pipeline
    42  
    43  	// stopChan signals to the state machine to stop running
    44  	stopChan chan struct{}
    45  	// doneChan is closed when the state machine terminates
    46  	doneChan chan struct{}
    47  }
    48  
    49  // New creates a new scheduler.
    50  func New(store *store.MemoryStore) *Scheduler {
    51  	return &Scheduler{
    52  		store:                   store,
    53  		unassignedTasks:         make(map[string]*api.Task),
    54  		pendingPreassignedTasks: make(map[string]*api.Task),
    55  		preassignedTasks:        make(map[string]struct{}),
    56  		allTasks:                make(map[string]*api.Task),
    57  		stopChan:                make(chan struct{}),
    58  		doneChan:                make(chan struct{}),
    59  		pipeline:                NewPipeline(),
    60  	}
    61  }
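// Usage sketch (illustrative, not part of the original file): a caller would
// typically construct the scheduler from a MemoryStore, run the event loop in
// a goroutine, and call Stop on shutdown. Passing nil as the proposer to
// store.NewMemoryStore is an assumption borrowed from test-style usage.
//
//	s := scheduler.New(store.NewMemoryStore(nil))
//	ctx := context.Background()
//	go func() {
//		if err := s.Run(ctx); err != nil {
//			log.G(ctx).WithError(err).Error("scheduler exited")
//		}
//	}()
//	// ...
//	s.Stop() // closes stopChan and waits for doneChan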
    62  
    63  func (s *Scheduler) setupTasksList(tx store.ReadTx) error {
    64  	tasks, err := store.FindTasks(tx, store.All)
    65  	if err != nil {
    66  		return err
    67  	}
    68  
    69  	tasksByNode := make(map[string]map[string]*api.Task)
    70  	for _, t := range tasks {
    71  		// Ignore all tasks that have not reached PENDING
    72  		// state and tasks that no longer consume resources.
    73  		if t.Status.State < api.TaskStatePending || t.Status.State > api.TaskStateRunning {
    74  			continue
    75  		}
    76  
    77  		// Also ignore tasks that have not yet been assigned but desired state
    78  		// is beyond TaskStateCompleted. This can happen if you update, delete
    79  		// or scale down a service before its tasks were assigned.
    80  		if t.Status.State == api.TaskStatePending && t.DesiredState > api.TaskStateCompleted {
    81  			continue
    82  		}
    83  
    84  		s.allTasks[t.ID] = t
    85  		if t.NodeID == "" {
    86  			s.enqueue(t)
    87  			continue
    88  		}
     89  		// preassigned tasks need their resource requirements validated on the corresponding node
    90  		if t.Status.State == api.TaskStatePending {
    91  			s.preassignedTasks[t.ID] = struct{}{}
    92  			s.pendingPreassignedTasks[t.ID] = t
    93  			continue
    94  		}
    95  
    96  		if tasksByNode[t.NodeID] == nil {
    97  			tasksByNode[t.NodeID] = make(map[string]*api.Task)
    98  		}
    99  		tasksByNode[t.NodeID][t.ID] = t
   100  	}
   101  
   102  	return s.buildNodeSet(tx, tasksByNode)
   103  }
   104  
   105  // Run is the scheduler event loop.
   106  func (s *Scheduler) Run(ctx context.Context) error {
   107  	defer close(s.doneChan)
   108  
   109  	updates, cancel, err := store.ViewAndWatch(s.store, s.setupTasksList)
   110  	if err != nil {
   111  		log.G(ctx).WithError(err).Errorf("snapshot store update failed")
   112  		return err
   113  	}
   114  	defer cancel()
   115  
    116  	// Validate resources for preassigned tasks before processing the
    117  	// other tasks, because preassigned tasks (such as those of global
    118  	// services) should start before other tasks.
   119  	s.processPreassignedTasks(ctx)
   120  
   121  	// Queue all unassigned tasks before processing changes.
   122  	s.tick(ctx)
   123  
   124  	const (
   125  		// commitDebounceGap is the amount of time to wait between
   126  		// commit events to debounce them.
   127  		commitDebounceGap = 50 * time.Millisecond
   128  		// maxLatency is a time limit on the debouncing.
   129  		maxLatency = time.Second
   130  	)
   131  	var (
   132  		debouncingStarted     time.Time
   133  		commitDebounceTimer   *time.Timer
   134  		commitDebounceTimeout <-chan time.Time
   135  	)
   136  
   137  	tickRequired := false
   138  
   139  	schedule := func() {
   140  		if len(s.pendingPreassignedTasks) > 0 {
   141  			s.processPreassignedTasks(ctx)
   142  		}
   143  		if tickRequired {
   144  			s.tick(ctx)
   145  			tickRequired = false
   146  		}
   147  	}
   148  
   149  	// Watch for changes.
   150  	for {
   151  		select {
   152  		case event := <-updates:
   153  			switch v := event.(type) {
   154  			case api.EventCreateTask:
   155  				if s.createTask(ctx, v.Task) {
   156  					tickRequired = true
   157  				}
   158  			case api.EventUpdateTask:
   159  				if s.updateTask(ctx, v.Task) {
   160  					tickRequired = true
   161  				}
   162  			case api.EventDeleteTask:
   163  				if s.deleteTask(v.Task) {
    164  					// deleting tasks may free up node resources, so pending tasks should be re-evaluated.
   165  					tickRequired = true
   166  				}
   167  			case api.EventCreateNode:
   168  				s.createOrUpdateNode(v.Node)
   169  				tickRequired = true
   170  			case api.EventUpdateNode:
   171  				s.createOrUpdateNode(v.Node)
   172  				tickRequired = true
   173  			case api.EventDeleteNode:
   174  				s.nodeSet.remove(v.Node.ID)
   175  			case state.EventCommit:
   176  				if commitDebounceTimer != nil {
   177  					if time.Since(debouncingStarted) > maxLatency {
   178  						commitDebounceTimer.Stop()
   179  						commitDebounceTimer = nil
   180  						commitDebounceTimeout = nil
   181  						schedule()
   182  					} else {
   183  						commitDebounceTimer.Reset(commitDebounceGap)
   184  					}
   185  				} else {
   186  					commitDebounceTimer = time.NewTimer(commitDebounceGap)
   187  					commitDebounceTimeout = commitDebounceTimer.C
   188  					debouncingStarted = time.Now()
   189  				}
   190  			}
   191  		case <-commitDebounceTimeout:
   192  			schedule()
   193  			commitDebounceTimer = nil
   194  			commitDebounceTimeout = nil
   195  		case <-s.stopChan:
   196  			return nil
   197  		}
   198  	}
   199  }
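// Debounce sketch (illustrative only): the EventCommit handling above
// coalesces bursts of commits into a single schedule() call, while maxLatency
// bounds how long scheduling can be deferred. The same pattern in isolation,
// using the commitDebounceGap and maxLatency values defined in Run and a
// hypothetical events channel and handle func, looks like this:
//
//	var (
//		timer   *time.Timer
//		timeout <-chan time.Time
//		started time.Time
//	)
//	for {
//		select {
//		case <-events:
//			if timer != nil && time.Since(started) > maxLatency {
//				timer.Stop()
//				timer, timeout = nil, nil
//				handle()
//			} else if timer != nil {
//				timer.Reset(commitDebounceGap)
//			} else {
//				timer = time.NewTimer(commitDebounceGap)
//				timeout = timer.C
//				started = time.Now()
//			}
//		case <-timeout:
//			handle()
//			timer, timeout = nil, nil
//		}
//	}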
   200  
   201  // Stop causes the scheduler event loop to stop running.
   202  func (s *Scheduler) Stop() {
   203  	close(s.stopChan)
   204  	<-s.doneChan
   205  }
   206  
   207  // enqueue queues a task for scheduling.
   208  func (s *Scheduler) enqueue(t *api.Task) {
   209  	s.unassignedTasks[t.ID] = t
   210  }
   211  
   212  func (s *Scheduler) createTask(ctx context.Context, t *api.Task) bool {
   213  	// Ignore all tasks that have not reached PENDING
   214  	// state, and tasks that no longer consume resources.
   215  	if t.Status.State < api.TaskStatePending || t.Status.State > api.TaskStateRunning {
   216  		return false
   217  	}
   218  
   219  	s.allTasks[t.ID] = t
   220  	if t.NodeID == "" {
   221  		// unassigned task
   222  		s.enqueue(t)
   223  		return true
   224  	}
   225  
   226  	if t.Status.State == api.TaskStatePending {
   227  		s.preassignedTasks[t.ID] = struct{}{}
   228  		s.pendingPreassignedTasks[t.ID] = t
   229  		// preassigned tasks do not contribute to running tasks count
   230  		return false
   231  	}
   232  
   233  	nodeInfo, err := s.nodeSet.nodeInfo(t.NodeID)
   234  	if err == nil && nodeInfo.addTask(t) {
   235  		s.nodeSet.updateNode(nodeInfo)
   236  	}
   237  
   238  	return false
   239  }
   240  
   241  func (s *Scheduler) updateTask(ctx context.Context, t *api.Task) bool {
   242  	// Ignore all tasks that have not reached PENDING
   243  	// state.
   244  	if t.Status.State < api.TaskStatePending {
   245  		return false
   246  	}
   247  
   248  	oldTask := s.allTasks[t.ID]
   249  
    250  	// Tasks that have progressed past the RUNNING state no longer
    251  	// consume resources; clean up any state held for them.
   252  	if t.Status.State > api.TaskStateRunning {
   253  		if oldTask == nil {
   254  			return false
   255  		}
   256  
   257  		if t.Status.State != oldTask.Status.State &&
   258  			(t.Status.State == api.TaskStateFailed || t.Status.State == api.TaskStateRejected) {
   259  			// Keep track of task failures, so other nodes can be preferred
   260  			// for scheduling this service if it looks like the service is
   261  			// failing in a loop on this node. However, skip this for
   262  			// preassigned tasks, because the scheduler does not choose
   263  			// which nodes those run on.
   264  			if _, wasPreassigned := s.preassignedTasks[t.ID]; !wasPreassigned {
   265  				nodeInfo, err := s.nodeSet.nodeInfo(t.NodeID)
   266  				if err == nil {
   267  					nodeInfo.taskFailed(ctx, t)
   268  					s.nodeSet.updateNode(nodeInfo)
   269  				}
   270  			}
   271  		}
   272  
   273  		s.deleteTask(oldTask)
   274  
   275  		return true
   276  	}
   277  
   278  	if t.NodeID == "" {
   279  		// unassigned task
   280  		if oldTask != nil {
   281  			s.deleteTask(oldTask)
   282  		}
   283  		s.allTasks[t.ID] = t
   284  		s.enqueue(t)
   285  		return true
   286  	}
   287  
   288  	if t.Status.State == api.TaskStatePending {
   289  		if oldTask != nil {
   290  			s.deleteTask(oldTask)
   291  		}
   292  		s.preassignedTasks[t.ID] = struct{}{}
   293  		s.allTasks[t.ID] = t
   294  		s.pendingPreassignedTasks[t.ID] = t
   295  		// preassigned tasks do not contribute to running tasks count
   296  		return false
   297  	}
   298  
   299  	s.allTasks[t.ID] = t
   300  	nodeInfo, err := s.nodeSet.nodeInfo(t.NodeID)
   301  	if err == nil && nodeInfo.addTask(t) {
   302  		s.nodeSet.updateNode(nodeInfo)
   303  	}
   304  
   305  	return false
   306  }
   307  
   308  func (s *Scheduler) deleteTask(t *api.Task) bool {
   309  	delete(s.allTasks, t.ID)
   310  	delete(s.preassignedTasks, t.ID)
   311  	delete(s.pendingPreassignedTasks, t.ID)
   312  	nodeInfo, err := s.nodeSet.nodeInfo(t.NodeID)
   313  	if err == nil && nodeInfo.removeTask(t) {
   314  		s.nodeSet.updateNode(nodeInfo)
   315  		return true
   316  	}
   317  	return false
   318  }
   319  
   320  func (s *Scheduler) createOrUpdateNode(n *api.Node) {
   321  	nodeInfo, nodeInfoErr := s.nodeSet.nodeInfo(n.ID)
   322  	var resources *api.Resources
   323  	if n.Description != nil && n.Description.Resources != nil {
   324  		resources = n.Description.Resources.Copy()
   325  		// reconcile resources by looping over all tasks in this node
   326  		if nodeInfoErr == nil {
   327  			for _, task := range nodeInfo.Tasks {
   328  				reservations := taskReservations(task.Spec)
   329  
   330  				resources.MemoryBytes -= reservations.MemoryBytes
   331  				resources.NanoCPUs -= reservations.NanoCPUs
   332  
   333  				genericresource.ConsumeNodeResources(&resources.Generic,
   334  					task.AssignedGenericResources)
   335  			}
   336  		}
   337  	} else {
   338  		resources = &api.Resources{}
   339  	}
   340  
   341  	if nodeInfoErr != nil {
   342  		nodeInfo = newNodeInfo(n, nil, *resources)
   343  	} else {
   344  		nodeInfo.Node = n
   345  		nodeInfo.AvailableResources = resources
   346  	}
   347  	s.nodeSet.addOrUpdateNode(nodeInfo)
   348  }
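// Worked example (illustrative): if a node advertises 4e9 NanoCPUs and 8 GiB
// of memory and currently runs two tasks that each reserve 1e9 NanoCPUs and
// 1 GiB, the reconciliation loop above leaves AvailableResources at:
//
//	node description:            {NanoCPUs: 4e9, MemoryBytes: 8 GiB}
//	two tasks' reservations:     {NanoCPUs: 1e9, MemoryBytes: 1 GiB} each
//	resulting AvailableResources {NanoCPUs: 2e9, MemoryBytes: 6 GiB}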
   349  
   350  func (s *Scheduler) processPreassignedTasks(ctx context.Context) {
   351  	schedulingDecisions := make(map[string]schedulingDecision, len(s.pendingPreassignedTasks))
   352  	for _, t := range s.pendingPreassignedTasks {
   353  		newT := s.taskFitNode(ctx, t, t.NodeID)
   354  		if newT == nil {
   355  			continue
   356  		}
   357  		schedulingDecisions[t.ID] = schedulingDecision{old: t, new: newT}
   358  	}
   359  
   360  	successful, failed := s.applySchedulingDecisions(ctx, schedulingDecisions)
   361  
   362  	for _, decision := range successful {
   363  		if decision.new.Status.State == api.TaskStateAssigned {
   364  			delete(s.pendingPreassignedTasks, decision.old.ID)
   365  		}
   366  	}
   367  	for _, decision := range failed {
   368  		s.allTasks[decision.old.ID] = decision.old
   369  		nodeInfo, err := s.nodeSet.nodeInfo(decision.new.NodeID)
   370  		if err == nil && nodeInfo.removeTask(decision.new) {
   371  			s.nodeSet.updateNode(nodeInfo)
   372  		}
   373  	}
   374  }
   375  
   376  // tick attempts to schedule the queue.
   377  func (s *Scheduler) tick(ctx context.Context) {
   378  	type commonSpecKey struct {
   379  		serviceID   string
   380  		specVersion api.Version
   381  	}
   382  	tasksByCommonSpec := make(map[commonSpecKey]map[string]*api.Task)
   383  	var oneOffTasks []*api.Task
   384  	schedulingDecisions := make(map[string]schedulingDecision, len(s.unassignedTasks))
   385  
   386  	for taskID, t := range s.unassignedTasks {
   387  		if t == nil || t.NodeID != "" {
   388  			// task deleted or already assigned
   389  			delete(s.unassignedTasks, taskID)
   390  			continue
   391  		}
   392  
   393  		// Group tasks with common specs
   394  		if t.SpecVersion != nil {
   395  			taskGroupKey := commonSpecKey{
   396  				serviceID:   t.ServiceID,
   397  				specVersion: *t.SpecVersion,
   398  			}
   399  
   400  			if tasksByCommonSpec[taskGroupKey] == nil {
   401  				tasksByCommonSpec[taskGroupKey] = make(map[string]*api.Task)
   402  			}
   403  			tasksByCommonSpec[taskGroupKey][taskID] = t
   404  		} else {
   405  			// This task doesn't have a spec version. We have to
   406  			// schedule it as a one-off.
   407  			oneOffTasks = append(oneOffTasks, t)
   408  		}
   409  		delete(s.unassignedTasks, taskID)
   410  	}
   411  
   412  	for _, taskGroup := range tasksByCommonSpec {
   413  		s.scheduleTaskGroup(ctx, taskGroup, schedulingDecisions)
   414  	}
   415  	for _, t := range oneOffTasks {
   416  		s.scheduleTaskGroup(ctx, map[string]*api.Task{t.ID: t}, schedulingDecisions)
   417  	}
   418  
   419  	_, failed := s.applySchedulingDecisions(ctx, schedulingDecisions)
   420  	for _, decision := range failed {
   421  		s.allTasks[decision.old.ID] = decision.old
   422  
   423  		nodeInfo, err := s.nodeSet.nodeInfo(decision.new.NodeID)
   424  		if err == nil && nodeInfo.removeTask(decision.new) {
   425  			s.nodeSet.updateNode(nodeInfo)
   426  		}
   427  
   428  		// enqueue task for next scheduling attempt
   429  		s.enqueue(decision.old)
   430  	}
   431  }
   432  
   433  func (s *Scheduler) applySchedulingDecisions(ctx context.Context, schedulingDecisions map[string]schedulingDecision) (successful, failed []schedulingDecision) {
   434  	if len(schedulingDecisions) == 0 {
   435  		return
   436  	}
   437  
   438  	successful = make([]schedulingDecision, 0, len(schedulingDecisions))
   439  
   440  	// Apply changes to master store
   441  	err := s.store.Batch(func(batch *store.Batch) error {
   442  		for len(schedulingDecisions) > 0 {
   443  			err := batch.Update(func(tx store.Tx) error {
    444  				// Update at most one task inside this Update
    445  				// callback.
   446  				for taskID, decision := range schedulingDecisions {
   447  					delete(schedulingDecisions, taskID)
   448  
   449  					t := store.GetTask(tx, taskID)
   450  					if t == nil {
   451  						// Task no longer exists
   452  						s.deleteTask(decision.new)
   453  						continue
   454  					}
   455  
   456  					if t.Status.State == decision.new.Status.State &&
   457  						t.Status.Message == decision.new.Status.Message &&
   458  						t.Status.Err == decision.new.Status.Err {
   459  						// No changes, ignore
   460  						continue
   461  					}
   462  
   463  					if t.Status.State >= api.TaskStateAssigned {
   464  						nodeInfo, err := s.nodeSet.nodeInfo(decision.new.NodeID)
   465  						if err != nil {
   466  							failed = append(failed, decision)
   467  							continue
   468  						}
   469  						node := store.GetNode(tx, decision.new.NodeID)
   470  						if node == nil || node.Meta.Version != nodeInfo.Meta.Version {
   471  							// node is out of date
   472  							failed = append(failed, decision)
   473  							continue
   474  						}
   475  					}
   476  
   477  					if err := store.UpdateTask(tx, decision.new); err != nil {
   478  						log.G(ctx).Debugf("scheduler failed to update task %s; will retry", taskID)
   479  						failed = append(failed, decision)
   480  						continue
   481  					}
   482  					successful = append(successful, decision)
   483  					return nil
   484  				}
   485  				return nil
   486  			})
   487  			if err != nil {
   488  				return err
   489  			}
   490  		}
   491  		return nil
   492  	})
   493  
   494  	if err != nil {
   495  		log.G(ctx).WithError(err).Error("scheduler tick transaction failed")
   496  		failed = append(failed, successful...)
   497  		successful = nil
   498  	}
   499  	return
   500  }
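// Batching sketch (illustrative): store.Batch groups many small writes, and
// the loop above deliberately touches at most one task per batch.Update call,
// which appears intended to let Batch split the work across small
// transactions. A minimal use of the same API, assuming a hypothetical ids
// slice of task IDs to rewrite:
//
//	err := s.store.Batch(func(batch *store.Batch) error {
//		for _, id := range ids {
//			uerr := batch.Update(func(tx store.Tx) error {
//				if t := store.GetTask(tx, id); t != nil {
//					return store.UpdateTask(tx, t)
//				}
//				return nil
//			})
//			if uerr != nil {
//				return uerr
//			}
//		}
//		return nil
//	})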
   501  
   502  // taskFitNode checks if a node has enough resources to accommodate a task.
   503  func (s *Scheduler) taskFitNode(ctx context.Context, t *api.Task, nodeID string) *api.Task {
   504  	nodeInfo, err := s.nodeSet.nodeInfo(nodeID)
   505  	if err != nil {
   506  		// node does not exist in set (it may have been deleted)
   507  		return nil
   508  	}
   509  	newT := *t
   510  	s.pipeline.SetTask(t)
   511  	if !s.pipeline.Process(&nodeInfo) {
   512  		// this node cannot accommodate this task
   513  		newT.Status.Timestamp = ptypes.MustTimestampProto(time.Now())
   514  		newT.Status.Err = s.pipeline.Explain()
   515  		s.allTasks[t.ID] = &newT
   516  
   517  		return &newT
   518  	}
   519  	newT.Status = api.TaskStatus{
   520  		State:     api.TaskStateAssigned,
   521  		Timestamp: ptypes.MustTimestampProto(time.Now()),
   522  		Message:   "scheduler confirmed task can run on preassigned node",
   523  	}
   524  	s.allTasks[t.ID] = &newT
   525  
   526  	if nodeInfo.addTask(&newT) {
   527  		s.nodeSet.updateNode(nodeInfo)
   528  	}
   529  	return &newT
   530  }
   531  
   532  // scheduleTaskGroup schedules a batch of tasks that are part of the same
   533  // service and share the same version of the spec.
   534  func (s *Scheduler) scheduleTaskGroup(ctx context.Context, taskGroup map[string]*api.Task, schedulingDecisions map[string]schedulingDecision) {
    535  	// Pick a task at random from taskGroup to use for constraint
   536  	// evaluation. It doesn't matter which one we pick because all the
   537  	// tasks in the group are equal in terms of the fields the constraint
   538  	// filters consider.
   539  	var t *api.Task
   540  	for _, t = range taskGroup {
   541  		break
   542  	}
   543  
   544  	s.pipeline.SetTask(t)
   545  
   546  	now := time.Now()
   547  
   548  	nodeLess := func(a *NodeInfo, b *NodeInfo) bool {
   549  		// If either node has at least maxFailures recent failures,
   550  		// that's the deciding factor.
   551  		recentFailuresA := a.countRecentFailures(now, t)
   552  		recentFailuresB := b.countRecentFailures(now, t)
   553  
   554  		if recentFailuresA >= maxFailures || recentFailuresB >= maxFailures {
   555  			if recentFailuresA > recentFailuresB {
   556  				return false
   557  			}
   558  			if recentFailuresB > recentFailuresA {
   559  				return true
   560  			}
   561  		}
   562  
   563  		tasksByServiceA := a.ActiveTasksCountByService[t.ServiceID]
   564  		tasksByServiceB := b.ActiveTasksCountByService[t.ServiceID]
   565  
   566  		if tasksByServiceA < tasksByServiceB {
   567  			return true
   568  		}
   569  		if tasksByServiceA > tasksByServiceB {
   570  			return false
   571  		}
   572  
   573  		// Total number of tasks breaks ties.
   574  		return a.ActiveTasksCount < b.ActiveTasksCount
   575  	}
   576  
   577  	var prefs []*api.PlacementPreference
   578  	if t.Spec.Placement != nil {
   579  		prefs = t.Spec.Placement.Preferences
   580  	}
   581  
   582  	tree := s.nodeSet.tree(t.ServiceID, prefs, len(taskGroup), s.pipeline.Process, nodeLess)
   583  
   584  	s.scheduleNTasksOnSubtree(ctx, len(taskGroup), taskGroup, &tree, schedulingDecisions, nodeLess)
   585  	if len(taskGroup) != 0 {
   586  		s.noSuitableNode(ctx, taskGroup, schedulingDecisions)
   587  	}
   588  }
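// Ordering example (illustrative): with the nodeLess comparator above, a node
// that has at least maxFailures recent failures for this service sorts after
// a healthier node regardless of load; otherwise the node running fewer tasks
// of this service wins, and total active tasks breaks ties:
//
//	A: 5 recent failures, 1 task of the service  -> sorts after B
//	B: 0 recent failures, 3 tasks of the service
//
//	C: 0 failures, 2 tasks of the service, 10 active tasks total
//	D: 0 failures, 2 tasks of the service,  4 active tasks total -> D preferred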
   589  
   590  func (s *Scheduler) scheduleNTasksOnSubtree(ctx context.Context, n int, taskGroup map[string]*api.Task, tree *decisionTree, schedulingDecisions map[string]schedulingDecision, nodeLess func(a *NodeInfo, b *NodeInfo) bool) int {
   591  	if tree.next == nil {
   592  		nodes := tree.orderedNodes(s.pipeline.Process, nodeLess)
   593  		if len(nodes) == 0 {
   594  			return 0
   595  		}
   596  
   597  		return s.scheduleNTasksOnNodes(ctx, n, taskGroup, nodes, schedulingDecisions, nodeLess)
   598  	}
   599  
   600  	// Walk the tree and figure out how the tasks should be split at each
   601  	// level.
   602  	tasksScheduled := 0
   603  	tasksInUsableBranches := tree.tasks
   604  	var noRoom map[*decisionTree]struct{}
   605  
   606  	// Try to make branches even until either all branches are
   607  	// full, or all tasks have been scheduled.
   608  	for tasksScheduled != n && len(noRoom) != len(tree.next) {
   609  		desiredTasksPerBranch := (tasksInUsableBranches + n - tasksScheduled) / (len(tree.next) - len(noRoom))
   610  		remainder := (tasksInUsableBranches + n - tasksScheduled) % (len(tree.next) - len(noRoom))
   611  
   612  		for _, subtree := range tree.next {
   613  			if noRoom != nil {
   614  				if _, ok := noRoom[subtree]; ok {
   615  					continue
   616  				}
   617  			}
   618  			subtreeTasks := subtree.tasks
   619  			if subtreeTasks < desiredTasksPerBranch || (subtreeTasks == desiredTasksPerBranch && remainder > 0) {
   620  				tasksToAssign := desiredTasksPerBranch - subtreeTasks
   621  				if remainder > 0 {
   622  					tasksToAssign++
   623  				}
   624  				res := s.scheduleNTasksOnSubtree(ctx, tasksToAssign, taskGroup, subtree, schedulingDecisions, nodeLess)
   625  				if res < tasksToAssign {
   626  					if noRoom == nil {
   627  						noRoom = make(map[*decisionTree]struct{})
   628  					}
   629  					noRoom[subtree] = struct{}{}
   630  					tasksInUsableBranches -= subtreeTasks
   631  				} else if remainder > 0 {
   632  					remainder--
   633  				}
   634  				tasksScheduled += res
   635  			}
   636  		}
   637  	}
   638  
   639  	return tasksScheduled
   640  }
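// Worked example (illustrative): scheduling n=5 new tasks over a tree with 3
// child branches that already hold 4 tasks in total (tasksInUsableBranches=4,
// tasksScheduled=0, noRoom empty):
//
//	desiredTasksPerBranch = (4 + 5 - 0) / 3 = 3
//	remainder             = (4 + 5 - 0) % 3 = 0
//
// Each branch holding fewer than 3 tasks is topped up toward 3; a branch that
// cannot absorb its share is added to noRoom and the remaining tasks are
// redistributed over the other branches on the next pass of the loop.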
   641  
   642  func (s *Scheduler) scheduleNTasksOnNodes(ctx context.Context, n int, taskGroup map[string]*api.Task, nodes []NodeInfo, schedulingDecisions map[string]schedulingDecision, nodeLess func(a *NodeInfo, b *NodeInfo) bool) int {
   643  	tasksScheduled := 0
   644  	failedConstraints := make(map[int]bool) // key is index in nodes slice
   645  	nodeIter := 0
   646  	nodeCount := len(nodes)
   647  	for taskID, t := range taskGroup {
   648  		// Skip tasks which were already scheduled because they ended
   649  		// up in two groups at once.
   650  		if _, exists := schedulingDecisions[taskID]; exists {
   651  			continue
   652  		}
   653  
   654  		node := &nodes[nodeIter%nodeCount]
   655  
   656  		log.G(ctx).WithField("task.id", t.ID).Debugf("assigning to node %s", node.ID)
   657  		newT := *t
   658  		newT.NodeID = node.ID
   659  		newT.Status = api.TaskStatus{
   660  			State:     api.TaskStateAssigned,
   661  			Timestamp: ptypes.MustTimestampProto(time.Now()),
   662  			Message:   "scheduler assigned task to node",
   663  		}
   664  		s.allTasks[t.ID] = &newT
   665  
   666  		nodeInfo, err := s.nodeSet.nodeInfo(node.ID)
   667  		if err == nil && nodeInfo.addTask(&newT) {
   668  			s.nodeSet.updateNode(nodeInfo)
   669  			nodes[nodeIter%nodeCount] = nodeInfo
   670  		}
   671  
   672  		schedulingDecisions[taskID] = schedulingDecision{old: t, new: &newT}
   673  		delete(taskGroup, taskID)
   674  		tasksScheduled++
   675  		if tasksScheduled == n {
   676  			return tasksScheduled
   677  		}
   678  
   679  		if nodeIter+1 < nodeCount {
   680  			// First pass fills the nodes until they have the same
   681  			// number of tasks from this service.
   682  			nextNode := nodes[(nodeIter+1)%nodeCount]
   683  			if nodeLess(&nextNode, &nodeInfo) {
   684  				nodeIter++
   685  			}
   686  		} else {
   687  			// In later passes, we just assign one task at a time
   688  			// to each node that still meets the constraints.
   689  			nodeIter++
   690  		}
   691  
   692  		origNodeIter := nodeIter
   693  		for failedConstraints[nodeIter%nodeCount] || !s.pipeline.Process(&nodes[nodeIter%nodeCount]) {
   694  			failedConstraints[nodeIter%nodeCount] = true
   695  			nodeIter++
   696  			if nodeIter-origNodeIter == nodeCount {
   697  				// None of the nodes meet the constraints anymore.
   698  				return tasksScheduled
   699  			}
   700  		}
   701  	}
   702  
   703  	return tasksScheduled
   704  }
   705  
    706  // noSuitableNode checks unassigned tasks and makes sure they have an existing service in the store before
    707  // updating the task status and adding them back to schedulingDecisions, unassignedTasks, and allTasks.
   708  func (s *Scheduler) noSuitableNode(ctx context.Context, taskGroup map[string]*api.Task, schedulingDecisions map[string]schedulingDecision) {
   709  	explanation := s.pipeline.Explain()
   710  	for _, t := range taskGroup {
   711  		var service *api.Service
   712  		s.store.View(func(tx store.ReadTx) {
   713  			service = store.GetService(tx, t.ServiceID)
   714  		})
   715  		if service == nil {
   716  			log.G(ctx).WithField("task.id", t.ID).Debug("removing task from the scheduler")
   717  			continue
   718  		}
   719  
   720  		log.G(ctx).WithField("task.id", t.ID).Debug("no suitable node available for task")
   721  
   722  		newT := *t
   723  		newT.Status.Timestamp = ptypes.MustTimestampProto(time.Now())
   724  		sv := service.SpecVersion
   725  		tv := newT.SpecVersion
   726  		if sv != nil && tv != nil && sv.Index > tv.Index {
   727  			log.G(ctx).WithField("task.id", t.ID).Debug(
   728  				"task belongs to old revision of service",
   729  			)
   730  			if t.Status.State == api.TaskStatePending && t.DesiredState >= api.TaskStateShutdown {
   731  				log.G(ctx).WithField("task.id", t.ID).Debug(
   732  					"task is desired shutdown, scheduler will go ahead and do so",
   733  				)
   734  				newT.Status.State = api.TaskStateShutdown
   735  				newT.Status.Err = ""
   736  			}
   737  		} else {
   738  			if explanation != "" {
   739  				newT.Status.Err = "no suitable node (" + explanation + ")"
   740  			} else {
   741  				newT.Status.Err = "no suitable node"
   742  			}
   743  
   744  			// re-enqueue a task that should still be attempted
   745  			s.enqueue(&newT)
   746  		}
   747  
   748  		s.allTasks[t.ID] = &newT
   749  		schedulingDecisions[t.ID] = schedulingDecision{old: t, new: &newT}
   750  	}
   751  }
   752  
   753  func (s *Scheduler) buildNodeSet(tx store.ReadTx, tasksByNode map[string]map[string]*api.Task) error {
   754  	nodes, err := store.FindNodes(tx, store.All)
   755  	if err != nil {
   756  		return err
   757  	}
   758  
   759  	s.nodeSet.alloc(len(nodes))
   760  
   761  	for _, n := range nodes {
   762  		var resources api.Resources
   763  		if n.Description != nil && n.Description.Resources != nil {
   764  			resources = *n.Description.Resources
   765  		}
   766  		s.nodeSet.addOrUpdateNode(newNodeInfo(n, tasksByNode[n.ID], resources))
   767  	}
   768  
   769  	return nil
   770  }