github.com/djenriquez/nomad-1@v0.8.1/scheduler/util.go

github.com/djenriquez/nomad-1@v0.8.1/scheduler/util.go (about)

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"math/rand"
     7  	"reflect"
     8  
     9  	memdb "github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
    13  // allocTuple is a tuple of the allocation name and potential alloc ID
    14  type allocTuple struct {
    15  	Name      string
    16  	TaskGroup *structs.TaskGroup
    17  	Alloc     *structs.Allocation
    18  }
    19  
    20  // materializeTaskGroups is used to materialize all the task groups
    21  // a job requires. This is used to do the count expansion.
    22  func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
    23  	out := make(map[string]*structs.TaskGroup)
    24  	if job.Stopped() {
    25  		return out
    26  	}
    27  
    28  	for _, tg := range job.TaskGroups {
    29  		for i := 0; i < tg.Count; i++ {
    30  			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
    31  			out[name] = tg
    32  		}
    33  	}
    34  	return out
    35  }
    36  
    37  // diffResult is used to return the sets that result from the diff
    38  type diffResult struct {
    39  	place, update, migrate, stop, ignore, lost []allocTuple
    40  }
    41  
    42  func (d *diffResult) GoString() string {
    43  	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)",
    44  		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost))
    45  }
    46  
    47  func (d *diffResult) Append(other *diffResult) {
    48  	d.place = append(d.place, other.place...)
    49  	d.update = append(d.update, other.update...)
    50  	d.migrate = append(d.migrate, other.migrate...)
    51  	d.stop = append(d.stop, other.stop...)
    52  	d.ignore = append(d.ignore, other.ignore...)
    53  	d.lost = append(d.lost, other.lost...)
    54  }
    55  
    56  // diffAllocs is used to do a set difference between the target allocations
    57  // and the existing allocations. This returns 6 sets of results, the list of
    58  // named task groups that need to be placed (no existing allocation), the
    59  // allocations that need to be updated (job definition is newer), allocs that
    60  // need to be migrated (node is draining), the allocs that need to be evicted
    61  // (no longer required), those that should be ignored and those that are lost
    62  // that need to be replaced (running on a lost node).
    63  //
    64  // job is the job whose allocs is going to be diff-ed.
    65  // taintedNodes is an index of the nodes which are either down or in drain mode
    66  // by name.
    67  // required is a set of allocations that must exist.
    68  // allocs is a list of non terminal allocations.
    69  // terminalAllocs is an index of the latest terminal allocations by name.
    70  func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node,
    71  	required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
    72  	terminalAllocs map[string]*structs.Allocation) *diffResult {
    73  	result := &diffResult{}
    74  
    75  	// Scan the existing updates
    76  	existing := make(map[string]struct{})
    77  	for _, exist := range allocs {
    78  		// Index the existing node
    79  		name := exist.Name
    80  		existing[name] = struct{}{}
    81  
    82  		// Check for the definition in the required set
    83  		tg, ok := required[name]
    84  
    85  		// If not required, we stop the alloc
    86  		if !ok {
    87  			result.stop = append(result.stop, allocTuple{
    88  				Name:      name,
    89  				TaskGroup: tg,
    90  				Alloc:     exist,
    91  			})
    92  			continue
    93  		}
    94  
    95  		// If we have been marked for migration and aren't terminal, migrate
    96  		if !exist.TerminalStatus() && exist.DesiredTransition.ShouldMigrate() {
    97  			result.migrate = append(result.migrate, allocTuple{
    98  				Name:      name,
    99  				TaskGroup: tg,
   100  				Alloc:     exist,
   101  			})
   102  			continue
   103  		}
   104  		// If we are on a tainted node, we must migrate if we are a service or
   105  		// if the batch allocation did not finish
   106  		if node, ok := taintedNodes[exist.NodeID]; ok {
   107  			// If the job is batch and finished successfully, the fact that the
   108  			// node is tainted does not mean it should be migrated or marked as
   109  			// lost as the work was already successfully finished. However for
   110  			// service/system jobs, tasks should never complete. The check of
   111  			// batch type, defends against client bugs.
   112  			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
   113  				goto IGNORE
   114  			}
   115  
   116  			if !exist.TerminalStatus() && (node == nil || node.TerminalStatus()) {
   117  				result.lost = append(result.lost, allocTuple{
   118  					Name:      name,
   119  					TaskGroup: tg,
   120  					Alloc:     exist,
   121  				})
   122  			} else {
   123  				goto IGNORE
   124  			}
   125  
   126  			continue
   127  		}
   128  
   129  		// If the definition is updated we need to update
   130  		if job.JobModifyIndex != exist.Job.JobModifyIndex {
   131  			result.update = append(result.update, allocTuple{
   132  				Name:      name,
   133  				TaskGroup: tg,
   134  				Alloc:     exist,
   135  			})
   136  			continue
   137  		}
   138  
   139  		// Everything is up-to-date
   140  	IGNORE:
   141  		result.ignore = append(result.ignore, allocTuple{
   142  			Name:      name,
   143  			TaskGroup: tg,
   144  			Alloc:     exist,
   145  		})
   146  	}
   147  
   148  	// Scan the required groups
   149  	for name, tg := range required {
   150  		// Check for an existing allocation
   151  		_, ok := existing[name]
   152  
   153  		// Require a placement if no existing allocation. If there
   154  		// is an existing allocation, we would have checked for a potential
   155  		// update or ignore above.
   156  		if !ok {
   157  			result.place = append(result.place, allocTuple{
   158  				Name:      name,
   159  				TaskGroup: tg,
   160  				Alloc:     terminalAllocs[name],
   161  			})
   162  		}
   163  	}
   164  	return result
   165  }
   166  
   167  // diffSystemAllocs is like diffAllocs however, the allocations in the
   168  // diffResult contain the specific nodeID they should be allocated on.
   169  //
   170  // job is the job whose allocs is going to be diff-ed.
   171  // nodes is a list of nodes in ready state.
   172  // taintedNodes is an index of the nodes which are either down or in drain mode
   173  // by name.
   174  // allocs is a list of non terminal allocations.
   175  // terminalAllocs is an index of the latest terminal allocations by name.
   176  func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
   177  	allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {
   178  
   179  	// Build a mapping of nodes to all their allocs.
   180  	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
   181  	for _, alloc := range allocs {
   182  		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
   183  		nodeAllocs[alloc.NodeID] = nallocs
   184  	}
   185  
   186  	for _, node := range nodes {
   187  		if _, ok := nodeAllocs[node.ID]; !ok {
   188  			nodeAllocs[node.ID] = nil
   189  		}
   190  	}
   191  
   192  	// Create the required task groups.
   193  	required := materializeTaskGroups(job)
   194  
   195  	result := &diffResult{}
   196  	for nodeID, allocs := range nodeAllocs {
   197  		diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs)
   198  
   199  		// If the node is tainted there should be no placements made
   200  		if _, ok := taintedNodes[nodeID]; ok {
   201  			diff.place = nil
   202  		} else {
   203  			// Mark the alloc as being for a specific node.
   204  			for i := range diff.place {
   205  				alloc := &diff.place[i]
   206  
   207  				// If the new allocation isn't annotated with a previous allocation
   208  				// or if the previous allocation isn't from the same node then we
   209  				// annotate the allocTuple with a new Allocation
   210  				if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
   211  					alloc.Alloc = &structs.Allocation{NodeID: nodeID}
   212  				}
   213  			}
   214  		}
   215  
   216  		result.Append(diff)
   217  	}
   218  
   219  	return result
   220  }
   221  
   222  // readyNodesInDCs returns all the ready nodes in the given datacenters and a
   223  // mapping of each data center to the count of ready nodes.
   224  func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
   225  	// Index the DCs
   226  	dcMap := make(map[string]int, len(dcs))
   227  	for _, dc := range dcs {
   228  		dcMap[dc] = 0
   229  	}
   230  
   231  	// Scan the nodes
   232  	ws := memdb.NewWatchSet()
   233  	var out []*structs.Node
   234  	iter, err := state.Nodes(ws)
   235  	if err != nil {
   236  		return nil, nil, err
   237  	}
   238  	for {
   239  		raw := iter.Next()
   240  		if raw == nil {
   241  			break
   242  		}
   243  
   244  		// Filter on datacenter and status
   245  		node := raw.(*structs.Node)
   246  		if node.Status != structs.NodeStatusReady {
   247  			continue
   248  		}
   249  		if node.Drain {
   250  			continue
   251  		}
   252  		if node.SchedulingEligibility != structs.NodeSchedulingEligible {
   253  			continue
   254  		}
   255  		if _, ok := dcMap[node.Datacenter]; !ok {
   256  			continue
   257  		}
   258  		out = append(out, node)
   259  		dcMap[node.Datacenter]++
   260  	}
   261  	return out, dcMap, nil
   262  }
   263  
   264  // retryMax is used to retry a callback until it returns success or
   265  // a maximum number of attempts is reached. An optional reset function may be
   266  // passed which is called after each failed iteration. If the reset function is
   267  // set and returns true, the number of attempts is reset back to max.
   268  func retryMax(max int, cb func() (bool, error), reset func() bool) error {
   269  	attempts := 0
   270  	for attempts < max {
   271  		done, err := cb()
   272  		if err != nil {
   273  			return err
   274  		}
   275  		if done {
   276  			return nil
   277  		}
   278  
   279  		// Check if we should reset the number attempts
   280  		if reset != nil && reset() {
   281  			attempts = 0
   282  		} else {
   283  			attempts++
   284  		}
   285  	}
   286  	return &SetStatusError{
   287  		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
   288  		EvalStatus: structs.EvalStatusFailed,
   289  	}
   290  }
   291  
   292  // progressMade checks to see if the plan result made allocations or updates.
   293  // If the result is nil, false is returned.
   294  func progressMade(result *structs.PlanResult) bool {
   295  	return result != nil && (len(result.NodeUpdate) != 0 ||
   296  		len(result.NodeAllocation) != 0 || result.Deployment != nil ||
   297  		len(result.DeploymentUpdates) != 0)
   298  }
   299  
   300  // taintedNodes is used to scan the allocations and then check if the
   301  // underlying nodes are tainted, and should force a migration of the allocation.
   302  // All the nodes returned in the map are tainted.
   303  func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
   304  	out := make(map[string]*structs.Node)
   305  	for _, alloc := range allocs {
   306  		if _, ok := out[alloc.NodeID]; ok {
   307  			continue
   308  		}
   309  
   310  		ws := memdb.NewWatchSet()
   311  		node, err := state.NodeByID(ws, alloc.NodeID)
   312  		if err != nil {
   313  			return nil, err
   314  		}
   315  
   316  		// If the node does not exist, we should migrate
   317  		if node == nil {
   318  			out[alloc.NodeID] = nil
   319  			continue
   320  		}
   321  		if structs.ShouldDrainNode(node.Status) || node.Drain {
   322  			out[alloc.NodeID] = node
   323  		}
   324  	}
   325  	return out, nil
   326  }
   327  
   328  // shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
   329  func shuffleNodes(nodes []*structs.Node) {
   330  	n := len(nodes)
   331  	for i := n - 1; i > 0; i-- {
   332  		j := rand.Intn(i + 1)
   333  		nodes[i], nodes[j] = nodes[j], nodes[i]
   334  	}
   335  }
   336  
   337  // tasksUpdated does a diff between task groups to see if the
   338  // tasks, their drivers, environment variables or config have updated. The
   339  // inputs are the task group name to diff and two jobs to diff.
   340  func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
   341  	a := jobA.LookupTaskGroup(taskGroup)
   342  	b := jobB.LookupTaskGroup(taskGroup)
   343  
   344  	// If the number of tasks do not match, clearly there is an update
   345  	if len(a.Tasks) != len(b.Tasks) {
   346  		return true
   347  	}
   348  
   349  	// Check ephemeral disk
   350  	if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
   351  		return true
   352  	}
   353  
   354  	// Check each task
   355  	for _, at := range a.Tasks {
   356  		bt := b.LookupTask(at.Name)
   357  		if bt == nil {
   358  			return true
   359  		}
   360  		if at.Driver != bt.Driver {
   361  			return true
   362  		}
   363  		if at.User != bt.User {
   364  			return true
   365  		}
   366  		if !reflect.DeepEqual(at.Config, bt.Config) {
   367  			return true
   368  		}
   369  		if !reflect.DeepEqual(at.Env, bt.Env) {
   370  			return true
   371  		}
   372  		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
   373  			return true
   374  		}
   375  		if !reflect.DeepEqual(at.Vault, bt.Vault) {
   376  			return true
   377  		}
   378  		if !reflect.DeepEqual(at.Templates, bt.Templates) {
   379  			return true
   380  		}
   381  
   382  		// Check the metadata
   383  		if !reflect.DeepEqual(
   384  			jobA.CombinedTaskMeta(taskGroup, at.Name),
   385  			jobB.CombinedTaskMeta(taskGroup, bt.Name)) {
   386  			return true
   387  		}
   388  
   389  		// Inspect the network to see if the dynamic ports are different
   390  		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
   391  			return true
   392  		}
   393  		for idx := range at.Resources.Networks {
   394  			an := at.Resources.Networks[idx]
   395  			bn := bt.Resources.Networks[idx]
   396  
   397  			if an.MBits != bn.MBits {
   398  				return true
   399  			}
   400  
   401  			aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
   402  			if !reflect.DeepEqual(aPorts, bPorts) {
   403  				return true
   404  			}
   405  		}
   406  
   407  		// Inspect the non-network resources
   408  		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
   409  			return true
   410  		} else if ar.MemoryMB != br.MemoryMB {
   411  			return true
   412  		} else if ar.IOPS != br.IOPS {
   413  			return true
   414  		}
   415  	}
   416  	return false
   417  }
   418  
   419  // networkPortMap takes a network resource and returns a map of port labels to
   420  // values. The value for dynamic ports is disregarded even if it is set. This
   421  // makes this function suitable for comparing two network resources for changes.
   422  func networkPortMap(n *structs.NetworkResource) map[string]int {
   423  	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
   424  	for _, p := range n.ReservedPorts {
   425  		m[p.Label] = p.Value
   426  	}
   427  	for _, p := range n.DynamicPorts {
   428  		m[p.Label] = -1
   429  	}
   430  	return m
   431  }
   432  
   433  // setStatus is used to update the status of the evaluation
   434  func setStatus(logger *log.Logger, planner Planner,
   435  	eval, nextEval, spawnedBlocked *structs.Evaluation,
   436  	tgMetrics map[string]*structs.AllocMetric, status, desc string,
   437  	queuedAllocs map[string]int, deploymentID string) error {
   438  
   439  	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
   440  	newEval := eval.Copy()
   441  	newEval.Status = status
   442  	newEval.StatusDescription = desc
   443  	newEval.DeploymentID = deploymentID
   444  	newEval.FailedTGAllocs = tgMetrics
   445  	if nextEval != nil {
   446  		newEval.NextEval = nextEval.ID
   447  	}
   448  	if spawnedBlocked != nil {
   449  		newEval.BlockedEval = spawnedBlocked.ID
   450  	}
   451  	if queuedAllocs != nil {
   452  		newEval.QueuedAllocations = queuedAllocs
   453  	}
   454  
   455  	return planner.UpdateEval(newEval)
   456  }
   457  
   458  // inplaceUpdate attempts to update allocations in-place where possible. It
   459  // returns the allocs that couldn't be done inplace and then those that could.
   460  func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
   461  	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {
   462  
   463  	// doInplace manipulates the updates map to make the current allocation
   464  	// an inplace update.
   465  	doInplace := func(cur, last, inplaceCount *int) {
   466  		updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur]
   467  		*cur--
   468  		*last--
   469  		*inplaceCount++
   470  	}
   471  
   472  	ws := memdb.NewWatchSet()
   473  	n := len(updates)
   474  	inplaceCount := 0
   475  	for i := 0; i < n; i++ {
   476  		// Get the update
   477  		update := updates[i]
   478  
   479  		// Check if the task drivers or config has changed, requires
   480  		// a rolling upgrade since that cannot be done in-place.
   481  		existing := update.Alloc.Job
   482  		if tasksUpdated(job, existing, update.TaskGroup.Name) {
   483  			continue
   484  		}
   485  
   486  		// Terminal batch allocations are not filtered when they are completed
   487  		// successfully. We should avoid adding the allocation to the plan in
   488  		// the case that it is an in-place update to avoid both additional data
   489  		// in the plan and work for the clients.
   490  		if update.Alloc.TerminalStatus() {
   491  			doInplace(&i, &n, &inplaceCount)
   492  			continue
   493  		}
   494  
   495  		// Get the existing node
   496  		node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID)
   497  		if err != nil {
   498  			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
   499  				eval, update.Alloc.NodeID, err)
   500  			continue
   501  		}
   502  		if node == nil {
   503  			continue
   504  		}
   505  
   506  		// Set the existing node as the base set
   507  		stack.SetNodes([]*structs.Node{node})
   508  
   509  		// Stage an eviction of the current allocation. This is done so that
   510  		// the current allocation is discounted when checking for feasibility.
   511  		// Otherwise we would be trying to fit the tasks current resources and
   512  		// updated resources. After select is called we can remove the evict.
   513  		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
   514  			allocInPlace, "")
   515  
   516  		// Attempt to match the task group
   517  		option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions
   518  
   519  		// Pop the allocation
   520  		ctx.Plan().PopUpdate(update.Alloc)
   521  
   522  		// Skip if we could not do an in-place update
   523  		if option == nil {
   524  			continue
   525  		}
   526  
   527  		// Restore the network offers from the existing allocation.
   528  		// We do not allow network resources (reserved/dynamic ports)
   529  		// to be updated. This is guarded in taskUpdated, so we can
   530  		// safely restore those here.
   531  		for task, resources := range option.TaskResources {
   532  			existing := update.Alloc.TaskResources[task]
   533  			resources.Networks = existing.Networks
   534  		}
   535  
   536  		// Create a shallow copy
   537  		newAlloc := new(structs.Allocation)
   538  		*newAlloc = *update.Alloc
   539  
   540  		// Update the allocation
   541  		newAlloc.EvalID = eval.ID
   542  		newAlloc.Job = nil       // Use the Job in the Plan
   543  		newAlloc.Resources = nil // Computed in Plan Apply
   544  		newAlloc.TaskResources = option.TaskResources
   545  		newAlloc.Metrics = ctx.Metrics()
   546  		ctx.Plan().AppendAlloc(newAlloc)
   547  
   548  		// Remove this allocation from the slice
   549  		doInplace(&i, &n, &inplaceCount)
   550  	}
   551  
   552  	if len(updates) > 0 {
   553  		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates))
   554  	}
   555  	return updates[:n], updates[n:]
   556  }
   557  
   558  // evictAndPlace is used to mark allocations for evicts and add them to the
   559  // placement queue. evictAndPlace modifies both the diffResult and the
   560  // limit. It returns true if the limit has been reached.
   561  func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
   562  	n := len(allocs)
   563  	for i := 0; i < n && i < *limit; i++ {
   564  		a := allocs[i]
   565  		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, "")
   566  		diff.place = append(diff.place, a)
   567  	}
   568  	if n <= *limit {
   569  		*limit -= n
   570  		return false
   571  	}
   572  	*limit = 0
   573  	return true
   574  }
   575  
   576  // tgConstrainTuple is used to store the total constraints of a task group.
   577  type tgConstrainTuple struct {
   578  	// Holds the combined constraints of the task group and all it's sub-tasks.
   579  	constraints []*structs.Constraint
   580  
   581  	// The set of required drivers within the task group.
   582  	drivers map[string]struct{}
   583  
   584  	// The combined resources of all tasks within the task group.
   585  	size *structs.Resources
   586  }
   587  
   588  // taskGroupConstraints collects the constraints, drivers and resources required by each
   589  // sub-task to aggregate the TaskGroup totals
   590  func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
   591  	c := tgConstrainTuple{
   592  		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
   593  		drivers:     make(map[string]struct{}),
   594  		size:        &structs.Resources{DiskMB: tg.EphemeralDisk.SizeMB},
   595  	}
   596  
   597  	c.constraints = append(c.constraints, tg.Constraints...)
   598  	for _, task := range tg.Tasks {
   599  		c.drivers[task.Driver] = struct{}{}
   600  		c.constraints = append(c.constraints, task.Constraints...)
   601  		c.size.Add(task.Resources)
   602  	}
   603  
   604  	return c
   605  }
   606  
   607  // desiredUpdates takes the diffResult as well as the set of inplace and
   608  // destructive updates and returns a map of task groups to their set of desired
   609  // updates.
   610  func desiredUpdates(diff *diffResult, inplaceUpdates,
   611  	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
   612  	desiredTgs := make(map[string]*structs.DesiredUpdates)
   613  
   614  	for _, tuple := range diff.place {
   615  		name := tuple.TaskGroup.Name
   616  		des, ok := desiredTgs[name]
   617  		if !ok {
   618  			des = &structs.DesiredUpdates{}
   619  			desiredTgs[name] = des
   620  		}
   621  
   622  		des.Place++
   623  	}
   624  
   625  	for _, tuple := range diff.stop {
   626  		name := tuple.Alloc.TaskGroup
   627  		des, ok := desiredTgs[name]
   628  		if !ok {
   629  			des = &structs.DesiredUpdates{}
   630  			desiredTgs[name] = des
   631  		}
   632  
   633  		des.Stop++
   634  	}
   635  
   636  	for _, tuple := range diff.ignore {
   637  		name := tuple.TaskGroup.Name
   638  		des, ok := desiredTgs[name]
   639  		if !ok {
   640  			des = &structs.DesiredUpdates{}
   641  			desiredTgs[name] = des
   642  		}
   643  
   644  		des.Ignore++
   645  	}
   646  
   647  	for _, tuple := range diff.migrate {
   648  		name := tuple.TaskGroup.Name
   649  		des, ok := desiredTgs[name]
   650  		if !ok {
   651  			des = &structs.DesiredUpdates{}
   652  			desiredTgs[name] = des
   653  		}
   654  
   655  		des.Migrate++
   656  	}
   657  
   658  	for _, tuple := range inplaceUpdates {
   659  		name := tuple.TaskGroup.Name
   660  		des, ok := desiredTgs[name]
   661  		if !ok {
   662  			des = &structs.DesiredUpdates{}
   663  			desiredTgs[name] = des
   664  		}
   665  
   666  		des.InPlaceUpdate++
   667  	}
   668  
   669  	for _, tuple := range destructiveUpdates {
   670  		name := tuple.TaskGroup.Name
   671  		des, ok := desiredTgs[name]
   672  		if !ok {
   673  			des = &structs.DesiredUpdates{}
   674  			desiredTgs[name] = des
   675  		}
   676  
   677  		des.DestructiveUpdate++
   678  	}
   679  
   680  	return desiredTgs
   681  }
   682  
   683  // adjustQueuedAllocations decrements the number of allocations pending per task
   684  // group based on the number of allocations successfully placed
   685  func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
   686  	if result == nil {
   687  		return
   688  	}
   689  
   690  	for _, allocations := range result.NodeAllocation {
   691  		for _, allocation := range allocations {
   692  			// Ensure that the allocation is newly created. We check that
   693  			// the CreateIndex is equal to the ModifyIndex in order to check
   694  			// that the allocation was just created. We do not check that
   695  			// the CreateIndex is equal to the results AllocIndex because
   696  			// the allocations we get back have gone through the planner's
   697  			// optimistic snapshot and thus their indexes may not be
   698  			// correct, but they will be consistent.
   699  			if allocation.CreateIndex != allocation.ModifyIndex {
   700  				continue
   701  			}
   702  
   703  			if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
   704  				queuedAllocs[allocation.TaskGroup]--
   705  			} else {
   706  				logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup)
   707  			}
   708  		}
   709  	}
   710  }
   711  
   712  // updateNonTerminalAllocsToLost updates the allocations which are in pending/running state on tainted node
   713  // to lost
   714  func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
   715  	for _, alloc := range allocs {
   716  		node, ok := tainted[alloc.NodeID]
   717  		if !ok {
   718  			continue
   719  		}
   720  
   721  		// Only handle down nodes or nodes that are gone (node == nil)
   722  		if node != nil && node.Status != structs.NodeStatusDown {
   723  			continue
   724  		}
   725  
   726  		// If the scheduler has marked it as stop already but the alloc wasn't
   727  		// terminal on the client change the status to lost.
   728  		if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
   729  			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
   730  				alloc.ClientStatus == structs.AllocClientStatusPending) {
   731  			plan.AppendUpdate(alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
   732  		}
   733  	}
   734  }
   735  
   736  // genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType
   737  // function to be passed into the reconciler. The factory takes objects that
   738  // exist only in the scheduler context and returns a function that can be used
   739  // by the reconciler to make decisions about how to update an allocation. The
   740  // factory allows the reconciler to be unaware of how to determine the type of
   741  // update necessary and can minimize the set of objects it is exposed to.
   742  func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType {
   743  	return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
   744  		// Same index, so nothing to do
   745  		if existing.Job.JobModifyIndex == newJob.JobModifyIndex {
   746  			return true, false, nil
   747  		}
   748  
   749  		// Check if the task drivers or config has changed, requires
   750  		// a destructive upgrade since that cannot be done in-place.
   751  		if tasksUpdated(newJob, existing.Job, newTG.Name) {
   752  			return false, true, nil
   753  		}
   754  
   755  		// Terminal batch allocations are not filtered when they are completed
   756  		// successfully. We should avoid adding the allocation to the plan in
   757  		// the case that it is an in-place update to avoid both additional data
   758  		// in the plan and work for the clients.
   759  		if existing.TerminalStatus() {
   760  			return true, false, nil
   761  		}
   762  
   763  		// Get the existing node
   764  		ws := memdb.NewWatchSet()
   765  		node, err := ctx.State().NodeByID(ws, existing.NodeID)
   766  		if err != nil {
   767  			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", evalID, existing.NodeID, err)
   768  			return true, false, nil
   769  		}
   770  		if node == nil {
   771  			return false, true, nil
   772  		}
   773  
   774  		// Set the existing node as the base set
   775  		stack.SetNodes([]*structs.Node{node})
   776  
   777  		// Stage an eviction of the current allocation. This is done so that
   778  		// the current allocation is discounted when checking for feasibility.
   779  		// Otherwise we would be trying to fit the tasks current resources and
   780  		// updated resources. After select is called we can remove the evict.
   781  		ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "")
   782  
   783  		// Attempt to match the task group
   784  		option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions
   785  
   786  		// Pop the allocation
   787  		ctx.Plan().PopUpdate(existing)
   788  
   789  		// Require destructive if we could not do an in-place update
   790  		if option == nil {
   791  			return false, true, nil
   792  		}
   793  
   794  		// Restore the network offers from the existing allocation.
   795  		// We do not allow network resources (reserved/dynamic ports)
   796  		// to be updated. This is guarded in taskUpdated, so we can
   797  		// safely restore those here.
   798  		for task, resources := range option.TaskResources {
   799  			existingResources := existing.TaskResources[task]
   800  			resources.Networks = existingResources.Networks
   801  		}
   802  
   803  		// Create a shallow copy
   804  		newAlloc := new(structs.Allocation)
   805  		*newAlloc = *existing
   806  
   807  		// Update the allocation
   808  		newAlloc.EvalID = evalID
   809  		newAlloc.Job = nil       // Use the Job in the Plan
   810  		newAlloc.Resources = nil // Computed in Plan Apply
   811  		newAlloc.TaskResources = option.TaskResources
   812  		newAlloc.Metrics = ctx.Metrics()
   813  		return false, false, newAlloc
   814  	}
   815  }