github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/scheduler/util.go (about)

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"math/rand"
     7  	"reflect"
     8  
     9  	memdb "github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
// allocTuple groups an allocation name with the task group it was expanded
// from and the allocation currently associated with that name, if any.
type allocTuple struct {
	// Name is the allocation name, e.g. "<job>.<group>[<index>]" as produced
	// by materializeTaskGroups.
	Name      string
	// TaskGroup is the required task group for Name. It is nil for tuples
	// whose name is no longer part of the required set (see diffAllocs).
	TaskGroup *structs.TaskGroup
	// Alloc is the existing allocation for Name. For placements it is the
	// latest terminal allocation with that name and may be nil.
	Alloc     *structs.Allocation
}
    19  
    20  // materializeTaskGroups is used to materialize all the task groups
    21  // a job requires. This is used to do the count expansion.
    22  func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
    23  	out := make(map[string]*structs.TaskGroup)
    24  	if job.Stopped() {
    25  		return out
    26  	}
    27  
    28  	for _, tg := range job.TaskGroups {
    29  		for i := 0; i < tg.Count; i++ {
    30  			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
    31  			out[name] = tg
    32  		}
    33  	}
    34  	return out
    35  }
    36  
    37  // diffResult is used to return the sets that result from the diff
    38  type diffResult struct {
    39  	place, update, migrate, stop, ignore, lost []allocTuple
    40  }
    41  
    42  func (d *diffResult) GoString() string {
    43  	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)",
    44  		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost))
    45  }
    46  
    47  func (d *diffResult) Append(other *diffResult) {
    48  	d.place = append(d.place, other.place...)
    49  	d.update = append(d.update, other.update...)
    50  	d.migrate = append(d.migrate, other.migrate...)
    51  	d.stop = append(d.stop, other.stop...)
    52  	d.ignore = append(d.ignore, other.ignore...)
    53  	d.lost = append(d.lost, other.lost...)
    54  }
    55  
    56  // diffAllocs is used to do a set difference between the target allocations
    57  // and the existing allocations. This returns 6 sets of results, the list of
    58  // named task groups that need to be placed (no existing allocation), the
    59  // allocations that need to be updated (job definition is newer), allocs that
    60  // need to be migrated (node is draining), the allocs that need to be evicted
    61  // (no longer required), those that should be ignored and those that are lost
    62  // that need to be replaced (running on a lost node).
    63  //
    64  // job is the job whose allocs is going to be diff-ed.
    65  // taintedNodes is an index of the nodes which are either down or in drain mode
    66  // by name.
    67  // required is a set of allocations that must exist.
    68  // allocs is a list of non terminal allocations.
    69  // terminalAllocs is an index of the latest terminal allocations by name.
    70  func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node,
    71  	required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
    72  	terminalAllocs map[string]*structs.Allocation) *diffResult {
    73  	result := &diffResult{}
    74  
    75  	// Scan the existing updates
    76  	existing := make(map[string]struct{})
    77  	for _, exist := range allocs {
    78  		// Index the existing node
    79  		name := exist.Name
    80  		existing[name] = struct{}{}
    81  
    82  		// Check for the definition in the required set
    83  		tg, ok := required[name]
    84  
    85  		// If not required, we stop the alloc
    86  		if !ok {
    87  			result.stop = append(result.stop, allocTuple{
    88  				Name:      name,
    89  				TaskGroup: tg,
    90  				Alloc:     exist,
    91  			})
    92  			continue
    93  		}
    94  
    95  		// If we are on a tainted node, we must migrate if we are a service or
    96  		// if the batch allocation did not finish
    97  		if node, ok := taintedNodes[exist.NodeID]; ok {
    98  			// If the job is batch and finished successfully, the fact that the
    99  			// node is tainted does not mean it should be migrated or marked as
   100  			// lost as the work was already successfully finished. However for
   101  			// service/system jobs, tasks should never complete. The check of
   102  			// batch type, defends against client bugs.
   103  			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
   104  				goto IGNORE
   105  			}
   106  
   107  			if node == nil || node.TerminalStatus() {
   108  				result.lost = append(result.lost, allocTuple{
   109  					Name:      name,
   110  					TaskGroup: tg,
   111  					Alloc:     exist,
   112  				})
   113  			} else {
   114  				// This is the drain case
   115  				result.migrate = append(result.migrate, allocTuple{
   116  					Name:      name,
   117  					TaskGroup: tg,
   118  					Alloc:     exist,
   119  				})
   120  			}
   121  			continue
   122  		}
   123  
   124  		// If the definition is updated we need to update
   125  		if job.JobModifyIndex != exist.Job.JobModifyIndex {
   126  			result.update = append(result.update, allocTuple{
   127  				Name:      name,
   128  				TaskGroup: tg,
   129  				Alloc:     exist,
   130  			})
   131  			continue
   132  		}
   133  
   134  		// Everything is up-to-date
   135  	IGNORE:
   136  		result.ignore = append(result.ignore, allocTuple{
   137  			Name:      name,
   138  			TaskGroup: tg,
   139  			Alloc:     exist,
   140  		})
   141  	}
   142  
   143  	// Scan the required groups
   144  	for name, tg := range required {
   145  		// Check for an existing allocation
   146  		_, ok := existing[name]
   147  
   148  		// Require a placement if no existing allocation. If there
   149  		// is an existing allocation, we would have checked for a potential
   150  		// update or ignore above.
   151  		if !ok {
   152  			result.place = append(result.place, allocTuple{
   153  				Name:      name,
   154  				TaskGroup: tg,
   155  				Alloc:     terminalAllocs[name],
   156  			})
   157  		}
   158  	}
   159  	return result
   160  }
   161  
   162  // diffSystemAllocs is like diffAllocs however, the allocations in the
   163  // diffResult contain the specific nodeID they should be allocated on.
   164  //
   165  // job is the job whose allocs is going to be diff-ed.
   166  // nodes is a list of nodes in ready state.
   167  // taintedNodes is an index of the nodes which are either down or in drain mode
   168  // by name.
   169  // allocs is a list of non terminal allocations.
   170  // terminalAllocs is an index of the latest terminal allocations by name.
   171  func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
   172  	allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {
   173  
   174  	// Build a mapping of nodes to all their allocs.
   175  	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
   176  	for _, alloc := range allocs {
   177  		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
   178  		nodeAllocs[alloc.NodeID] = nallocs
   179  	}
   180  
   181  	for _, node := range nodes {
   182  		if _, ok := nodeAllocs[node.ID]; !ok {
   183  			nodeAllocs[node.ID] = nil
   184  		}
   185  	}
   186  
   187  	// Create the required task groups.
   188  	required := materializeTaskGroups(job)
   189  
   190  	result := &diffResult{}
   191  	for nodeID, allocs := range nodeAllocs {
   192  		diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs)
   193  
   194  		// If the node is tainted there should be no placements made
   195  		if _, ok := taintedNodes[nodeID]; ok {
   196  			diff.place = nil
   197  		} else {
   198  			// Mark the alloc as being for a specific node.
   199  			for i := range diff.place {
   200  				alloc := &diff.place[i]
   201  
   202  				// If the new allocation isn't annotated with a previous allocation
   203  				// or if the previous allocation isn't from the same node then we
   204  				// annotate the allocTuple with a new Allocation
   205  				if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
   206  					alloc.Alloc = &structs.Allocation{NodeID: nodeID}
   207  				}
   208  			}
   209  		}
   210  
   211  		// Migrate does not apply to system jobs and instead should be marked as
   212  		// stop because if a node is tainted, the job is invalid on that node.
   213  		diff.stop = append(diff.stop, diff.migrate...)
   214  		diff.migrate = nil
   215  
   216  		result.Append(diff)
   217  	}
   218  
   219  	return result
   220  }
   221  
   222  // readyNodesInDCs returns all the ready nodes in the given datacenters and a
   223  // mapping of each data center to the count of ready nodes.
   224  func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
   225  	// Index the DCs
   226  	dcMap := make(map[string]int, len(dcs))
   227  	for _, dc := range dcs {
   228  		dcMap[dc] = 0
   229  	}
   230  
   231  	// Scan the nodes
   232  	ws := memdb.NewWatchSet()
   233  	var out []*structs.Node
   234  	iter, err := state.Nodes(ws)
   235  	if err != nil {
   236  		return nil, nil, err
   237  	}
   238  	for {
   239  		raw := iter.Next()
   240  		if raw == nil {
   241  			break
   242  		}
   243  
   244  		// Filter on datacenter and status
   245  		node := raw.(*structs.Node)
   246  		if node.Status != structs.NodeStatusReady {
   247  			continue
   248  		}
   249  		if node.Drain {
   250  			continue
   251  		}
   252  		if _, ok := dcMap[node.Datacenter]; !ok {
   253  			continue
   254  		}
   255  		out = append(out, node)
   256  		dcMap[node.Datacenter]++
   257  	}
   258  	return out, dcMap, nil
   259  }
   260  
   261  // retryMax is used to retry a callback until it returns success or
   262  // a maximum number of attempts is reached. An optional reset function may be
   263  // passed which is called after each failed iteration. If the reset function is
   264  // set and returns true, the number of attempts is reset back to max.
   265  func retryMax(max int, cb func() (bool, error), reset func() bool) error {
   266  	attempts := 0
   267  	for attempts < max {
   268  		done, err := cb()
   269  		if err != nil {
   270  			return err
   271  		}
   272  		if done {
   273  			return nil
   274  		}
   275  
   276  		// Check if we should reset the number attempts
   277  		if reset != nil && reset() {
   278  			attempts = 0
   279  		} else {
   280  			attempts++
   281  		}
   282  	}
   283  	return &SetStatusError{
   284  		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
   285  		EvalStatus: structs.EvalStatusFailed,
   286  	}
   287  }
   288  
   289  // progressMade checks to see if the plan result made allocations or updates.
   290  // If the result is nil, false is returned.
   291  func progressMade(result *structs.PlanResult) bool {
   292  	return result != nil && (len(result.NodeUpdate) != 0 ||
   293  		len(result.NodeAllocation) != 0 || result.Deployment != nil ||
   294  		len(result.DeploymentUpdates) != 0)
   295  }
   296  
   297  // taintedNodes is used to scan the allocations and then check if the
   298  // underlying nodes are tainted, and should force a migration of the allocation.
   299  // All the nodes returned in the map are tainted.
   300  func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
   301  	out := make(map[string]*structs.Node)
   302  	for _, alloc := range allocs {
   303  		if _, ok := out[alloc.NodeID]; ok {
   304  			continue
   305  		}
   306  
   307  		ws := memdb.NewWatchSet()
   308  		node, err := state.NodeByID(ws, alloc.NodeID)
   309  		if err != nil {
   310  			return nil, err
   311  		}
   312  
   313  		// If the node does not exist, we should migrate
   314  		if node == nil {
   315  			out[alloc.NodeID] = nil
   316  			continue
   317  		}
   318  		if structs.ShouldDrainNode(node.Status) || node.Drain {
   319  			out[alloc.NodeID] = node
   320  		}
   321  	}
   322  	return out, nil
   323  }
   324  
   325  // shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
   326  func shuffleNodes(nodes []*structs.Node) {
   327  	n := len(nodes)
   328  	for i := n - 1; i > 0; i-- {
   329  		j := rand.Intn(i + 1)
   330  		nodes[i], nodes[j] = nodes[j], nodes[i]
   331  	}
   332  }
   333  
   334  // tasksUpdated does a diff between task groups to see if the
   335  // tasks, their drivers, environment variables or config have updated. The
   336  // inputs are the task group name to diff and two jobs to diff.
   337  func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
   338  	a := jobA.LookupTaskGroup(taskGroup)
   339  	b := jobB.LookupTaskGroup(taskGroup)
   340  
   341  	// If the number of tasks do not match, clearly there is an update
   342  	if len(a.Tasks) != len(b.Tasks) {
   343  		return true
   344  	}
   345  
   346  	// Check ephemeral disk
   347  	if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
   348  		return true
   349  	}
   350  
   351  	// Check each task
   352  	for _, at := range a.Tasks {
   353  		bt := b.LookupTask(at.Name)
   354  		if bt == nil {
   355  			return true
   356  		}
   357  		if at.Driver != bt.Driver {
   358  			return true
   359  		}
   360  		if at.User != bt.User {
   361  			return true
   362  		}
   363  		if !reflect.DeepEqual(at.Config, bt.Config) {
   364  			return true
   365  		}
   366  		if !reflect.DeepEqual(at.Env, bt.Env) {
   367  			return true
   368  		}
   369  		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
   370  			return true
   371  		}
   372  		if !reflect.DeepEqual(at.Vault, bt.Vault) {
   373  			return true
   374  		}
   375  		if !reflect.DeepEqual(at.Templates, bt.Templates) {
   376  			return true
   377  		}
   378  
   379  		// Check the metadata
   380  		if !reflect.DeepEqual(
   381  			jobA.CombinedTaskMeta(taskGroup, at.Name),
   382  			jobB.CombinedTaskMeta(taskGroup, bt.Name)) {
   383  			return true
   384  		}
   385  
   386  		// Inspect the network to see if the dynamic ports are different
   387  		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
   388  			return true
   389  		}
   390  		for idx := range at.Resources.Networks {
   391  			an := at.Resources.Networks[idx]
   392  			bn := bt.Resources.Networks[idx]
   393  
   394  			if an.MBits != bn.MBits {
   395  				return true
   396  			}
   397  
   398  			aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
   399  			if !reflect.DeepEqual(aPorts, bPorts) {
   400  				return true
   401  			}
   402  		}
   403  
   404  		// Inspect the non-network resources
   405  		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
   406  			return true
   407  		} else if ar.MemoryMB != br.MemoryMB {
   408  			return true
   409  		} else if ar.IOPS != br.IOPS {
   410  			return true
   411  		}
   412  	}
   413  	return false
   414  }
   415  
   416  // networkPortMap takes a network resource and returns a map of port labels to
   417  // values. The value for dynamic ports is disregarded even if it is set. This
   418  // makes this function suitable for comparing two network resources for changes.
   419  func networkPortMap(n *structs.NetworkResource) map[string]int {
   420  	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
   421  	for _, p := range n.ReservedPorts {
   422  		m[p.Label] = p.Value
   423  	}
   424  	for _, p := range n.DynamicPorts {
   425  		m[p.Label] = -1
   426  	}
   427  	return m
   428  }
   429  
   430  // setStatus is used to update the status of the evaluation
   431  func setStatus(logger *log.Logger, planner Planner,
   432  	eval, nextEval, spawnedBlocked *structs.Evaluation,
   433  	tgMetrics map[string]*structs.AllocMetric, status, desc string,
   434  	queuedAllocs map[string]int, deploymentID string) error {
   435  
   436  	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
   437  	newEval := eval.Copy()
   438  	newEval.Status = status
   439  	newEval.StatusDescription = desc
   440  	newEval.DeploymentID = deploymentID
   441  	newEval.FailedTGAllocs = tgMetrics
   442  	if nextEval != nil {
   443  		newEval.NextEval = nextEval.ID
   444  	}
   445  	if spawnedBlocked != nil {
   446  		newEval.BlockedEval = spawnedBlocked.ID
   447  	}
   448  	if queuedAllocs != nil {
   449  		newEval.QueuedAllocations = queuedAllocs
   450  	}
   451  
   452  	return planner.UpdateEval(newEval)
   453  }
   454  
   455  // inplaceUpdate attempts to update allocations in-place where possible. It
   456  // returns the allocs that couldn't be done inplace and then those that could.
   457  func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
   458  	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {
   459  
   460  	// doInplace manipulates the updates map to make the current allocation
   461  	// an inplace update.
   462  	doInplace := func(cur, last, inplaceCount *int) {
   463  		updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur]
   464  		*cur--
   465  		*last--
   466  		*inplaceCount++
   467  	}
   468  
   469  	ws := memdb.NewWatchSet()
   470  	n := len(updates)
   471  	inplaceCount := 0
   472  	for i := 0; i < n; i++ {
   473  		// Get the update
   474  		update := updates[i]
   475  
   476  		// Check if the task drivers or config has changed, requires
   477  		// a rolling upgrade since that cannot be done in-place.
   478  		existing := update.Alloc.Job
   479  		if tasksUpdated(job, existing, update.TaskGroup.Name) {
   480  			continue
   481  		}
   482  
   483  		// Terminal batch allocations are not filtered when they are completed
   484  		// successfully. We should avoid adding the allocation to the plan in
   485  		// the case that it is an in-place update to avoid both additional data
   486  		// in the plan and work for the clients.
   487  		if update.Alloc.TerminalStatus() {
   488  			doInplace(&i, &n, &inplaceCount)
   489  			continue
   490  		}
   491  
   492  		// Get the existing node
   493  		node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID)
   494  		if err != nil {
   495  			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
   496  				eval, update.Alloc.NodeID, err)
   497  			continue
   498  		}
   499  		if node == nil {
   500  			continue
   501  		}
   502  
   503  		// Set the existing node as the base set
   504  		stack.SetNodes([]*structs.Node{node})
   505  
   506  		// Stage an eviction of the current allocation. This is done so that
   507  		// the current allocation is discounted when checking for feasability.
   508  		// Otherwise we would be trying to fit the tasks current resources and
   509  		// updated resources. After select is called we can remove the evict.
   510  		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
   511  			allocInPlace, "")
   512  
   513  		// Attempt to match the task group
   514  		option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions
   515  
   516  		// Pop the allocation
   517  		ctx.Plan().PopUpdate(update.Alloc)
   518  
   519  		// Skip if we could not do an in-place update
   520  		if option == nil {
   521  			continue
   522  		}
   523  
   524  		// Restore the network offers from the existing allocation.
   525  		// We do not allow network resources (reserved/dynamic ports)
   526  		// to be updated. This is guarded in taskUpdated, so we can
   527  		// safely restore those here.
   528  		for task, resources := range option.TaskResources {
   529  			existing := update.Alloc.TaskResources[task]
   530  			resources.Networks = existing.Networks
   531  		}
   532  
   533  		// Create a shallow copy
   534  		newAlloc := new(structs.Allocation)
   535  		*newAlloc = *update.Alloc
   536  
   537  		// Update the allocation
   538  		newAlloc.EvalID = eval.ID
   539  		newAlloc.Job = nil       // Use the Job in the Plan
   540  		newAlloc.Resources = nil // Computed in Plan Apply
   541  		newAlloc.TaskResources = option.TaskResources
   542  		newAlloc.Metrics = ctx.Metrics()
   543  		ctx.Plan().AppendAlloc(newAlloc)
   544  
   545  		// Remove this allocation from the slice
   546  		doInplace(&i, &n, &inplaceCount)
   547  	}
   548  
   549  	if len(updates) > 0 {
   550  		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates))
   551  	}
   552  	return updates[:n], updates[n:]
   553  }
   554  
   555  // evictAndPlace is used to mark allocations for evicts and add them to the
   556  // placement queue. evictAndPlace modifies both the diffResult and the
   557  // limit. It returns true if the limit has been reached.
   558  func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
   559  	n := len(allocs)
   560  	for i := 0; i < n && i < *limit; i++ {
   561  		a := allocs[i]
   562  		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, "")
   563  		diff.place = append(diff.place, a)
   564  	}
   565  	if n <= *limit {
   566  		*limit -= n
   567  		return false
   568  	}
   569  	*limit = 0
   570  	return true
   571  }
   572  
// tgConstrainTuple is used to store the total constraints of a task group,
// as aggregated by taskGroupConstraints.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group, plus the
	// task group's ephemeral disk size.
	size *structs.Resources
}
   584  
   585  // taskGroupConstraints collects the constraints, drivers and resources required by each
   586  // sub-task to aggregate the TaskGroup totals
   587  func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
   588  	c := tgConstrainTuple{
   589  		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
   590  		drivers:     make(map[string]struct{}),
   591  		size:        &structs.Resources{DiskMB: tg.EphemeralDisk.SizeMB},
   592  	}
   593  
   594  	c.constraints = append(c.constraints, tg.Constraints...)
   595  	for _, task := range tg.Tasks {
   596  		c.drivers[task.Driver] = struct{}{}
   597  		c.constraints = append(c.constraints, task.Constraints...)
   598  		c.size.Add(task.Resources)
   599  	}
   600  
   601  	return c
   602  }
   603  
   604  // desiredUpdates takes the diffResult as well as the set of inplace and
   605  // destructive updates and returns a map of task groups to their set of desired
   606  // updates.
   607  func desiredUpdates(diff *diffResult, inplaceUpdates,
   608  	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
   609  	desiredTgs := make(map[string]*structs.DesiredUpdates)
   610  
   611  	for _, tuple := range diff.place {
   612  		name := tuple.TaskGroup.Name
   613  		des, ok := desiredTgs[name]
   614  		if !ok {
   615  			des = &structs.DesiredUpdates{}
   616  			desiredTgs[name] = des
   617  		}
   618  
   619  		des.Place++
   620  	}
   621  
   622  	for _, tuple := range diff.stop {
   623  		name := tuple.Alloc.TaskGroup
   624  		des, ok := desiredTgs[name]
   625  		if !ok {
   626  			des = &structs.DesiredUpdates{}
   627  			desiredTgs[name] = des
   628  		}
   629  
   630  		des.Stop++
   631  	}
   632  
   633  	for _, tuple := range diff.ignore {
   634  		name := tuple.TaskGroup.Name
   635  		des, ok := desiredTgs[name]
   636  		if !ok {
   637  			des = &structs.DesiredUpdates{}
   638  			desiredTgs[name] = des
   639  		}
   640  
   641  		des.Ignore++
   642  	}
   643  
   644  	for _, tuple := range diff.migrate {
   645  		name := tuple.TaskGroup.Name
   646  		des, ok := desiredTgs[name]
   647  		if !ok {
   648  			des = &structs.DesiredUpdates{}
   649  			desiredTgs[name] = des
   650  		}
   651  
   652  		des.Migrate++
   653  	}
   654  
   655  	for _, tuple := range inplaceUpdates {
   656  		name := tuple.TaskGroup.Name
   657  		des, ok := desiredTgs[name]
   658  		if !ok {
   659  			des = &structs.DesiredUpdates{}
   660  			desiredTgs[name] = des
   661  		}
   662  
   663  		des.InPlaceUpdate++
   664  	}
   665  
   666  	for _, tuple := range destructiveUpdates {
   667  		name := tuple.TaskGroup.Name
   668  		des, ok := desiredTgs[name]
   669  		if !ok {
   670  			des = &structs.DesiredUpdates{}
   671  			desiredTgs[name] = des
   672  		}
   673  
   674  		des.DestructiveUpdate++
   675  	}
   676  
   677  	return desiredTgs
   678  }
   679  
   680  // adjustQueuedAllocations decrements the number of allocations pending per task
   681  // group based on the number of allocations successfully placed
   682  func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
   683  	if result == nil {
   684  		return
   685  	}
   686  
   687  	for _, allocations := range result.NodeAllocation {
   688  		for _, allocation := range allocations {
   689  			// Ensure that the allocation is newly created. We check that
   690  			// the CreateIndex is equal to the ModifyIndex in order to check
   691  			// that the allocation was just created. We do not check that
   692  			// the CreateIndex is equal to the results AllocIndex because
   693  			// the allocations we get back have gone through the planner's
   694  			// optimistic snapshot and thus their indexes may not be
   695  			// correct, but they will be consistent.
   696  			if allocation.CreateIndex != allocation.ModifyIndex {
   697  				continue
   698  			}
   699  
   700  			if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
   701  				queuedAllocs[allocation.TaskGroup]--
   702  			} else {
   703  				logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup)
   704  			}
   705  		}
   706  	}
   707  }
   708  
   709  // updateNonTerminalAllocsToLost updates the allocations which are in pending/running state on tainted node
   710  // to lost
   711  func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
   712  	for _, alloc := range allocs {
   713  		if _, ok := tainted[alloc.NodeID]; ok &&
   714  			alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
   715  			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
   716  				alloc.ClientStatus == structs.AllocClientStatusPending) {
   717  			plan.AppendUpdate(alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
   718  		}
   719  	}
   720  }
   721  
   722  // genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType
   723  // function to be passed into the reconciler. The factory takes objects that
   724  // exist only in the scheduler context and returns a function that can be used
   725  // by the reconciler to make decisions about how to update an allocation. The
   726  // factory allows the reconciler to be unaware of how to determine the type of
   727  // update necessary and can minimize the set of objects it is exposed to.
   728  func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType {
   729  	return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
   730  		// Same index, so nothing to do
   731  		if existing.Job.JobModifyIndex == newJob.JobModifyIndex {
   732  			return true, false, nil
   733  		}
   734  
   735  		// Check if the task drivers or config has changed, requires
   736  		// a destructive upgrade since that cannot be done in-place.
   737  		if tasksUpdated(newJob, existing.Job, newTG.Name) {
   738  			return false, true, nil
   739  		}
   740  
   741  		// Terminal batch allocations are not filtered when they are completed
   742  		// successfully. We should avoid adding the allocation to the plan in
   743  		// the case that it is an in-place update to avoid both additional data
   744  		// in the plan and work for the clients.
   745  		if existing.TerminalStatus() {
   746  			return true, false, nil
   747  		}
   748  
   749  		// Get the existing node
   750  		ws := memdb.NewWatchSet()
   751  		node, err := ctx.State().NodeByID(ws, existing.NodeID)
   752  		if err != nil {
   753  			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", evalID, existing.NodeID, err)
   754  			return true, false, nil
   755  		}
   756  		if node == nil {
   757  			return false, true, nil
   758  		}
   759  
   760  		// Set the existing node as the base set
   761  		stack.SetNodes([]*structs.Node{node})
   762  
   763  		// Stage an eviction of the current allocation. This is done so that
   764  		// the current allocation is discounted when checking for feasability.
   765  		// Otherwise we would be trying to fit the tasks current resources and
   766  		// updated resources. After select is called we can remove the evict.
   767  		ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "")
   768  
   769  		// Attempt to match the task group
   770  		option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions
   771  
   772  		// Pop the allocation
   773  		ctx.Plan().PopUpdate(existing)
   774  
   775  		// Require destructive if we could not do an in-place update
   776  		if option == nil {
   777  			return false, true, nil
   778  		}
   779  
   780  		// Restore the network offers from the existing allocation.
   781  		// We do not allow network resources (reserved/dynamic ports)
   782  		// to be updated. This is guarded in taskUpdated, so we can
   783  		// safely restore those here.
   784  		for task, resources := range option.TaskResources {
   785  			existingResources := existing.TaskResources[task]
   786  			resources.Networks = existingResources.Networks
   787  		}
   788  
   789  		// Create a shallow copy
   790  		newAlloc := new(structs.Allocation)
   791  		*newAlloc = *existing
   792  
   793  		// Update the allocation
   794  		newAlloc.EvalID = evalID
   795  		newAlloc.Job = nil       // Use the Job in the Plan
   796  		newAlloc.Resources = nil // Computed in Plan Apply
   797  		newAlloc.TaskResources = option.TaskResources
   798  		newAlloc.Metrics = ctx.Metrics()
   799  		return false, false, newAlloc
   800  	}
   801  }