github.com/jrxfive/nomad@v0.6.1-0.20170802162750-1fef470e89bf/scheduler/util.go

github.com/jrxfive/nomad@v0.6.1-0.20170802162750-1fef470e89bf/scheduler/util.go (about)

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"log"
     6  	"math/rand"
     7  	"reflect"
     8  
     9  	memdb "github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
// allocTuple groups an allocation name with the task group it maps to and an
// allocation associated with that name.
type allocTuple struct {
	// Name is the allocation name the tuple refers to.
	Name      string
	// TaskGroup is the task group required for Name. It is nil for tuples
	// produced for allocations that are no longer required.
	TaskGroup *structs.TaskGroup
	// Alloc is the allocation associated with Name. For placements it may be
	// nil when there is no previous allocation to refer to.
	Alloc     *structs.Allocation
}
    19  
    20  // materializeTaskGroups is used to materialize all the task groups
    21  // a job requires. This is used to do the count expansion.
    22  func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
    23  	out := make(map[string]*structs.TaskGroup)
    24  	if job.Stopped() {
    25  		return out
    26  	}
    27  
    28  	for _, tg := range job.TaskGroups {
    29  		for i := 0; i < tg.Count; i++ {
    30  			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
    31  			out[name] = tg
    32  		}
    33  	}
    34  	return out
    35  }
    36  
    37  // diffResult is used to return the sets that result from the diff
    38  type diffResult struct {
    39  	place, update, migrate, stop, ignore, lost []allocTuple
    40  }
    41  
    42  func (d *diffResult) GoString() string {
    43  	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)",
    44  		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost))
    45  }
    46  
    47  func (d *diffResult) Append(other *diffResult) {
    48  	d.place = append(d.place, other.place...)
    49  	d.update = append(d.update, other.update...)
    50  	d.migrate = append(d.migrate, other.migrate...)
    51  	d.stop = append(d.stop, other.stop...)
    52  	d.ignore = append(d.ignore, other.ignore...)
    53  	d.lost = append(d.lost, other.lost...)
    54  }
    55  
    56  // diffAllocs is used to do a set difference between the target allocations
    57  // and the existing allocations. This returns 6 sets of results, the list of
    58  // named task groups that need to be placed (no existing allocation), the
    59  // allocations that need to be updated (job definition is newer), allocs that
    60  // need to be migrated (node is draining), the allocs that need to be evicted
    61  // (no longer required), those that should be ignored and those that are lost
    62  // that need to be replaced (running on a lost node).
    63  //
    64  // job is the job whose allocs is going to be diff-ed.
    65  // taintedNodes is an index of the nodes which are either down or in drain mode
    66  // by name.
    67  // required is a set of allocations that must exist.
    68  // allocs is a list of non terminal allocations.
    69  // terminalAllocs is an index of the latest terminal allocations by name.
    70  func diffAllocs(job *structs.Job, taintedNodes map[string]*structs.Node,
    71  	required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
    72  	terminalAllocs map[string]*structs.Allocation) *diffResult {
    73  	result := &diffResult{}
    74  
    75  	// Scan the existing updates
    76  	existing := make(map[string]struct{})
    77  	for _, exist := range allocs {
    78  		// Index the existing node
    79  		name := exist.Name
    80  		existing[name] = struct{}{}
    81  
    82  		// Check for the definition in the required set
    83  		tg, ok := required[name]
    84  
    85  		// If not required, we stop the alloc
    86  		if !ok {
    87  			result.stop = append(result.stop, allocTuple{
    88  				Name:      name,
    89  				TaskGroup: tg,
    90  				Alloc:     exist,
    91  			})
    92  			continue
    93  		}
    94  
    95  		// If we are on a tainted node, we must migrate if we are a service or
    96  		// if the batch allocation did not finish
    97  		if node, ok := taintedNodes[exist.NodeID]; ok {
    98  			// If the job is batch and finished successfully, the fact that the
    99  			// node is tainted does not mean it should be migrated or marked as
   100  			// lost as the work was already successfully finished. However for
   101  			// service/system jobs, tasks should never complete. The check of
   102  			// batch type, defends against client bugs.
   103  			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
   104  				goto IGNORE
   105  			}
   106  
   107  			if node == nil || node.TerminalStatus() {
   108  				result.lost = append(result.lost, allocTuple{
   109  					Name:      name,
   110  					TaskGroup: tg,
   111  					Alloc:     exist,
   112  				})
   113  			} else {
   114  				// This is the drain case
   115  				result.migrate = append(result.migrate, allocTuple{
   116  					Name:      name,
   117  					TaskGroup: tg,
   118  					Alloc:     exist,
   119  				})
   120  			}
   121  			continue
   122  		}
   123  
   124  		// If the definition is updated we need to update
   125  		if job.JobModifyIndex != exist.Job.JobModifyIndex {
   126  			result.update = append(result.update, allocTuple{
   127  				Name:      name,
   128  				TaskGroup: tg,
   129  				Alloc:     exist,
   130  			})
   131  			continue
   132  		}
   133  
   134  		// Everything is up-to-date
   135  	IGNORE:
   136  		result.ignore = append(result.ignore, allocTuple{
   137  			Name:      name,
   138  			TaskGroup: tg,
   139  			Alloc:     exist,
   140  		})
   141  	}
   142  
   143  	// Scan the required groups
   144  	for name, tg := range required {
   145  		// Check for an existing allocation
   146  		_, ok := existing[name]
   147  
   148  		// Require a placement if no existing allocation. If there
   149  		// is an existing allocation, we would have checked for a potential
   150  		// update or ignore above.
   151  		if !ok {
   152  			result.place = append(result.place, allocTuple{
   153  				Name:      name,
   154  				TaskGroup: tg,
   155  				Alloc:     terminalAllocs[name],
   156  			})
   157  		}
   158  	}
   159  	return result
   160  }
   161  
   162  // diffSystemAllocs is like diffAllocs however, the allocations in the
   163  // diffResult contain the specific nodeID they should be allocated on.
   164  //
   165  // job is the job whose allocs is going to be diff-ed.
   166  // nodes is a list of nodes in ready state.
   167  // taintedNodes is an index of the nodes which are either down or in drain mode
   168  // by name.
   169  // allocs is a list of non terminal allocations.
   170  // terminalAllocs is an index of the latest terminal allocations by name.
   171  func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
   172  	allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {
   173  
   174  	// Build a mapping of nodes to all their allocs.
   175  	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
   176  	for _, alloc := range allocs {
   177  		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
   178  		nodeAllocs[alloc.NodeID] = nallocs
   179  	}
   180  
   181  	for _, node := range nodes {
   182  		if _, ok := nodeAllocs[node.ID]; !ok {
   183  			nodeAllocs[node.ID] = nil
   184  		}
   185  	}
   186  
   187  	// Create the required task groups.
   188  	required := materializeTaskGroups(job)
   189  
   190  	result := &diffResult{}
   191  	for nodeID, allocs := range nodeAllocs {
   192  		diff := diffAllocs(job, taintedNodes, required, allocs, terminalAllocs)
   193  
   194  		// If the node is tainted there should be no placements made
   195  		if _, ok := taintedNodes[nodeID]; ok {
   196  			diff.place = nil
   197  		} else {
   198  			// Mark the alloc as being for a specific node.
   199  			for i := range diff.place {
   200  				alloc := &diff.place[i]
   201  
   202  				// If the new allocation isn't annotated with a previous allocation
   203  				// or if the previous allocation isn't from the same node then we
   204  				// annotate the allocTuple with a new Allocation
   205  				if alloc.Alloc == nil || alloc.Alloc.NodeID != nodeID {
   206  					alloc.Alloc = &structs.Allocation{NodeID: nodeID}
   207  				}
   208  			}
   209  		}
   210  
   211  		// Migrate does not apply to system jobs and instead should be marked as
   212  		// stop because if a node is tainted, the job is invalid on that node.
   213  		diff.stop = append(diff.stop, diff.migrate...)
   214  		diff.migrate = nil
   215  
   216  		result.Append(diff)
   217  	}
   218  
   219  	return result
   220  }
   221  
   222  // readyNodesInDCs returns all the ready nodes in the given datacenters and a
   223  // mapping of each data center to the count of ready nodes.
   224  func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
   225  	// Index the DCs
   226  	dcMap := make(map[string]int, len(dcs))
   227  	for _, dc := range dcs {
   228  		dcMap[dc] = 0
   229  	}
   230  
   231  	// Scan the nodes
   232  	ws := memdb.NewWatchSet()
   233  	var out []*structs.Node
   234  	iter, err := state.Nodes(ws)
   235  	if err != nil {
   236  		return nil, nil, err
   237  	}
   238  	for {
   239  		raw := iter.Next()
   240  		if raw == nil {
   241  			break
   242  		}
   243  
   244  		// Filter on datacenter and status
   245  		node := raw.(*structs.Node)
   246  		if node.Status != structs.NodeStatusReady {
   247  			continue
   248  		}
   249  		if node.Drain {
   250  			continue
   251  		}
   252  		if _, ok := dcMap[node.Datacenter]; !ok {
   253  			continue
   254  		}
   255  		out = append(out, node)
   256  		dcMap[node.Datacenter] += 1
   257  	}
   258  	return out, dcMap, nil
   259  }
   260  
   261  // retryMax is used to retry a callback until it returns success or
   262  // a maximum number of attempts is reached. An optional reset function may be
   263  // passed which is called after each failed iteration. If the reset function is
   264  // set and returns true, the number of attempts is reset back to max.
   265  func retryMax(max int, cb func() (bool, error), reset func() bool) error {
   266  	attempts := 0
   267  	for attempts < max {
   268  		done, err := cb()
   269  		if err != nil {
   270  			return err
   271  		}
   272  		if done {
   273  			return nil
   274  		}
   275  
   276  		// Check if we should reset the number attempts
   277  		if reset != nil && reset() {
   278  			attempts = 0
   279  		} else {
   280  			attempts += 1
   281  		}
   282  	}
   283  	return &SetStatusError{
   284  		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
   285  		EvalStatus: structs.EvalStatusFailed,
   286  	}
   287  }
   288  
   289  // progressMade checks to see if the plan result made allocations or updates.
   290  // If the result is nil, false is returned.
   291  func progressMade(result *structs.PlanResult) bool {
   292  	return result != nil && (len(result.NodeUpdate) != 0 ||
   293  		len(result.NodeAllocation) != 0 || result.Deployment != nil ||
   294  		len(result.DeploymentUpdates) != 0)
   295  }
   296  
   297  // taintedNodes is used to scan the allocations and then check if the
   298  // underlying nodes are tainted, and should force a migration of the allocation.
   299  // All the nodes returned in the map are tainted.
   300  func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
   301  	out := make(map[string]*structs.Node)
   302  	for _, alloc := range allocs {
   303  		if _, ok := out[alloc.NodeID]; ok {
   304  			continue
   305  		}
   306  
   307  		ws := memdb.NewWatchSet()
   308  		node, err := state.NodeByID(ws, alloc.NodeID)
   309  		if err != nil {
   310  			return nil, err
   311  		}
   312  
   313  		// If the node does not exist, we should migrate
   314  		if node == nil {
   315  			out[alloc.NodeID] = nil
   316  			continue
   317  		}
   318  		if structs.ShouldDrainNode(node.Status) || node.Drain {
   319  			out[alloc.NodeID] = node
   320  		}
   321  	}
   322  	return out, nil
   323  }
   324  
   325  // shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
   326  func shuffleNodes(nodes []*structs.Node) {
   327  	n := len(nodes)
   328  	for i := n - 1; i > 0; i-- {
   329  		j := rand.Intn(i + 1)
   330  		nodes[i], nodes[j] = nodes[j], nodes[i]
   331  	}
   332  }
   333  
   334  // tasksUpdated does a diff between task groups to see if the
   335  // tasks, their drivers, environment variables or config have updated. The
   336  // inputs are the task group name to diff and two jobs to diff.
   337  func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
   338  	a := jobA.LookupTaskGroup(taskGroup)
   339  	b := jobB.LookupTaskGroup(taskGroup)
   340  
   341  	// If the number of tasks do not match, clearly there is an update
   342  	if len(a.Tasks) != len(b.Tasks) {
   343  		return true
   344  	}
   345  
   346  	// Check ephemeral disk
   347  	if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
   348  		return true
   349  	}
   350  
   351  	// Check each task
   352  	for _, at := range a.Tasks {
   353  		bt := b.LookupTask(at.Name)
   354  		if bt == nil {
   355  			return true
   356  		}
   357  		if at.Driver != bt.Driver {
   358  			return true
   359  		}
   360  		if at.User != bt.User {
   361  			return true
   362  		}
   363  		if !reflect.DeepEqual(at.Config, bt.Config) {
   364  			return true
   365  		}
   366  		if !reflect.DeepEqual(at.Env, bt.Env) {
   367  			return true
   368  		}
   369  		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
   370  			return true
   371  		}
   372  		if !reflect.DeepEqual(at.Vault, bt.Vault) {
   373  			return true
   374  		}
   375  		if !reflect.DeepEqual(at.Templates, bt.Templates) {
   376  			return true
   377  		}
   378  
   379  		// Check the metadata
   380  		if !reflect.DeepEqual(
   381  			jobA.CombinedTaskMeta(taskGroup, at.Name),
   382  			jobB.CombinedTaskMeta(taskGroup, bt.Name)) {
   383  			return true
   384  		}
   385  
   386  		// Inspect the network to see if the dynamic ports are different
   387  		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
   388  			return true
   389  		}
   390  		for idx := range at.Resources.Networks {
   391  			an := at.Resources.Networks[idx]
   392  			bn := bt.Resources.Networks[idx]
   393  
   394  			if an.MBits != bn.MBits {
   395  				return true
   396  			}
   397  
   398  			aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
   399  			if !reflect.DeepEqual(aPorts, bPorts) {
   400  				return true
   401  			}
   402  		}
   403  
   404  		// Inspect the non-network resources
   405  		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
   406  			return true
   407  		} else if ar.MemoryMB != br.MemoryMB {
   408  			return true
   409  		} else if ar.IOPS != br.IOPS {
   410  			return true
   411  		}
   412  	}
   413  	return false
   414  }
   415  
   416  // networkPortMap takes a network resource and returns a map of port labels to
   417  // values. The value for dynamic ports is disregarded even if it is set. This
   418  // makes this function suitable for comparing two network resources for changes.
   419  func networkPortMap(n *structs.NetworkResource) map[string]int {
   420  	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
   421  	for _, p := range n.ReservedPorts {
   422  		m[p.Label] = p.Value
   423  	}
   424  	for _, p := range n.DynamicPorts {
   425  		m[p.Label] = -1
   426  	}
   427  	return m
   428  }
   429  
   430  // setStatus is used to update the status of the evaluation
   431  func setStatus(logger *log.Logger, planner Planner,
   432  	eval, nextEval, spawnedBlocked *structs.Evaluation,
   433  	tgMetrics map[string]*structs.AllocMetric, status, desc string,
   434  	queuedAllocs map[string]int, deploymentID string) error {
   435  
   436  	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
   437  	newEval := eval.Copy()
   438  	newEval.Status = status
   439  	newEval.StatusDescription = desc
   440  	newEval.DeploymentID = deploymentID
   441  	newEval.FailedTGAllocs = tgMetrics
   442  	if nextEval != nil {
   443  		newEval.NextEval = nextEval.ID
   444  	}
   445  	if spawnedBlocked != nil {
   446  		newEval.BlockedEval = spawnedBlocked.ID
   447  	}
   448  	if queuedAllocs != nil {
   449  		newEval.QueuedAllocations = queuedAllocs
   450  	}
   451  
   452  	return planner.UpdateEval(newEval)
   453  }
   454  
   455  // inplaceUpdate attempts to update allocations in-place where possible. It
   456  // returns the allocs that couldn't be done inplace and then those that could.
   457  func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
   458  	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {
   459  
   460  	// doInplace manipulates the updates map to make the current allocation
   461  	// an inplace update.
   462  	doInplace := func(cur, last, inplaceCount *int) {
   463  		updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur]
   464  		*cur--
   465  		*last--
   466  		*inplaceCount++
   467  	}
   468  
   469  	ws := memdb.NewWatchSet()
   470  	n := len(updates)
   471  	inplaceCount := 0
   472  	for i := 0; i < n; i++ {
   473  		// Get the update
   474  		update := updates[i]
   475  
   476  		// Check if the task drivers or config has changed, requires
   477  		// a rolling upgrade since that cannot be done in-place.
   478  		existing := update.Alloc.Job
   479  		if tasksUpdated(job, existing, update.TaskGroup.Name) {
   480  			continue
   481  		}
   482  
   483  		// Terminal batch allocations are not filtered when they are completed
   484  		// successfully. We should avoid adding the allocation to the plan in
   485  		// the case that it is an in-place update to avoid both additional data
   486  		// in the plan and work for the clients.
   487  		if update.Alloc.TerminalStatus() {
   488  			doInplace(&i, &n, &inplaceCount)
   489  			continue
   490  		}
   491  
   492  		// Get the existing node
   493  		node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID)
   494  		if err != nil {
   495  			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
   496  				eval, update.Alloc.NodeID, err)
   497  			continue
   498  		}
   499  		if node == nil {
   500  			continue
   501  		}
   502  
   503  		// Set the existing node as the base set
   504  		stack.SetNodes([]*structs.Node{node})
   505  
   506  		// Stage an eviction of the current allocation. This is done so that
   507  		// the current allocation is discounted when checking for feasability.
   508  		// Otherwise we would be trying to fit the tasks current resources and
   509  		// updated resources. After select is called we can remove the evict.
   510  		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
   511  			allocInPlace, "")
   512  
   513  		// Attempt to match the task group
   514  		option, _ := stack.Select(update.TaskGroup)
   515  
   516  		// Pop the allocation
   517  		ctx.Plan().PopUpdate(update.Alloc)
   518  
   519  		// Skip if we could not do an in-place update
   520  		if option == nil {
   521  			continue
   522  		}
   523  
   524  		// Restore the network offers from the existing allocation.
   525  		// We do not allow network resources (reserved/dynamic ports)
   526  		// to be updated. This is guarded in taskUpdated, so we can
   527  		// safely restore those here.
   528  		for task, resources := range option.TaskResources {
   529  			existing := update.Alloc.TaskResources[task]
   530  			resources.Networks = existing.Networks
   531  		}
   532  
   533  		// Create a shallow copy
   534  		newAlloc := new(structs.Allocation)
   535  		*newAlloc = *update.Alloc
   536  
   537  		// Update the allocation
   538  		newAlloc.EvalID = eval.ID
   539  		newAlloc.Job = nil       // Use the Job in the Plan
   540  		newAlloc.Resources = nil // Computed in Plan Apply
   541  		newAlloc.TaskResources = option.TaskResources
   542  		newAlloc.Metrics = ctx.Metrics()
   543  		ctx.Plan().AppendAlloc(newAlloc)
   544  
   545  		// Remove this allocation from the slice
   546  		doInplace(&i, &n, &inplaceCount)
   547  	}
   548  
   549  	if len(updates) > 0 {
   550  		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates))
   551  	}
   552  	return updates[:n], updates[n:]
   553  }
   554  
   555  // evictAndPlace is used to mark allocations for evicts and add them to the
   556  // placement queue. evictAndPlace modifies both the diffResult and the
   557  // limit. It returns true if the limit has been reached.
   558  func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
   559  	n := len(allocs)
   560  	for i := 0; i < n && i < *limit; i++ {
   561  		a := allocs[i]
   562  		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, "")
   563  		diff.place = append(diff.place, a)
   564  	}
   565  	if n <= *limit {
   566  		*limit -= n
   567  		return false
   568  	}
   569  	*limit = 0
   570  	return true
   571  }
   572  
   573  // markLostAndPlace is used to mark allocations as lost and add them to the
   574  // placement queue. evictAndPlace modifies both the diffResult and the
   575  // limit. It returns true if the limit has been reached.
   576  func markLostAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
   577  	n := len(allocs)
   578  	for i := 0; i < n && i < *limit; i++ {
   579  		a := allocs[i]
   580  		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc, structs.AllocClientStatusLost)
   581  		diff.place = append(diff.place, a)
   582  	}
   583  	if n <= *limit {
   584  		*limit -= n
   585  		return false
   586  	}
   587  	*limit = 0
   588  	return true
   589  }
   590  
// tgConstrainTuple is used to store the aggregated requirements of a task
// group: its constraints, the drivers it needs and its total resources.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group, keyed by driver name.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group.
	size *structs.Resources
}
   602  
   603  // taskGroupConstraints collects the constraints, drivers and resources required by each
   604  // sub-task to aggregate the TaskGroup totals
   605  func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
   606  	c := tgConstrainTuple{
   607  		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
   608  		drivers:     make(map[string]struct{}),
   609  		size:        &structs.Resources{DiskMB: tg.EphemeralDisk.SizeMB},
   610  	}
   611  
   612  	c.constraints = append(c.constraints, tg.Constraints...)
   613  	for _, task := range tg.Tasks {
   614  		c.drivers[task.Driver] = struct{}{}
   615  		c.constraints = append(c.constraints, task.Constraints...)
   616  		c.size.Add(task.Resources)
   617  	}
   618  
   619  	return c
   620  }
   621  
   622  // desiredUpdates takes the diffResult as well as the set of inplace and
   623  // destructive updates and returns a map of task groups to their set of desired
   624  // updates.
   625  func desiredUpdates(diff *diffResult, inplaceUpdates,
   626  	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
   627  	desiredTgs := make(map[string]*structs.DesiredUpdates)
   628  
   629  	for _, tuple := range diff.place {
   630  		name := tuple.TaskGroup.Name
   631  		des, ok := desiredTgs[name]
   632  		if !ok {
   633  			des = &structs.DesiredUpdates{}
   634  			desiredTgs[name] = des
   635  		}
   636  
   637  		des.Place++
   638  	}
   639  
   640  	for _, tuple := range diff.stop {
   641  		name := tuple.Alloc.TaskGroup
   642  		des, ok := desiredTgs[name]
   643  		if !ok {
   644  			des = &structs.DesiredUpdates{}
   645  			desiredTgs[name] = des
   646  		}
   647  
   648  		des.Stop++
   649  	}
   650  
   651  	for _, tuple := range diff.ignore {
   652  		name := tuple.TaskGroup.Name
   653  		des, ok := desiredTgs[name]
   654  		if !ok {
   655  			des = &structs.DesiredUpdates{}
   656  			desiredTgs[name] = des
   657  		}
   658  
   659  		des.Ignore++
   660  	}
   661  
   662  	for _, tuple := range diff.migrate {
   663  		name := tuple.TaskGroup.Name
   664  		des, ok := desiredTgs[name]
   665  		if !ok {
   666  			des = &structs.DesiredUpdates{}
   667  			desiredTgs[name] = des
   668  		}
   669  
   670  		des.Migrate++
   671  	}
   672  
   673  	for _, tuple := range inplaceUpdates {
   674  		name := tuple.TaskGroup.Name
   675  		des, ok := desiredTgs[name]
   676  		if !ok {
   677  			des = &structs.DesiredUpdates{}
   678  			desiredTgs[name] = des
   679  		}
   680  
   681  		des.InPlaceUpdate++
   682  	}
   683  
   684  	for _, tuple := range destructiveUpdates {
   685  		name := tuple.TaskGroup.Name
   686  		des, ok := desiredTgs[name]
   687  		if !ok {
   688  			des = &structs.DesiredUpdates{}
   689  			desiredTgs[name] = des
   690  		}
   691  
   692  		des.DestructiveUpdate++
   693  	}
   694  
   695  	return desiredTgs
   696  }
   697  
   698  // adjustQueuedAllocations decrements the number of allocations pending per task
   699  // group based on the number of allocations successfully placed
   700  func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
   701  	if result == nil {
   702  		return
   703  	}
   704  
   705  	for _, allocations := range result.NodeAllocation {
   706  		for _, allocation := range allocations {
   707  			// Ensure that the allocation is newly created. We check that
   708  			// the CreateIndex is equal to the ModifyIndex in order to check
   709  			// that the allocation was just created. We do not check that
   710  			// the CreateIndex is equal to the results AllocIndex because
   711  			// the allocations we get back have gone through the planner's
   712  			// optimistic snapshot and thus their indexes may not be
   713  			// correct, but they will be consistent.
   714  			if allocation.CreateIndex != allocation.ModifyIndex {
   715  				continue
   716  			}
   717  
   718  			if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
   719  				queuedAllocs[allocation.TaskGroup] -= 1
   720  			} else {
   721  				logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup)
   722  			}
   723  		}
   724  	}
   725  }
   726  
   727  // updateNonTerminalAllocsToLost updates the allocations which are in pending/running state on tainted node
   728  // to lost
   729  func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
   730  	for _, alloc := range allocs {
   731  		if _, ok := tainted[alloc.NodeID]; ok &&
   732  			alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
   733  			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
   734  				alloc.ClientStatus == structs.AllocClientStatusPending) {
   735  			plan.AppendUpdate(alloc, structs.AllocDesiredStatusStop, allocLost, structs.AllocClientStatusLost)
   736  		}
   737  	}
   738  }
   739  
   740  // genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType
   741  // function to be passed into the reconciler. The factory takes objects that
   742  // exist only in the scheduler context and returns a function that can be used
   743  // by the reconciler to make decsions about how to update an allocation. The
   744  // factory allows the reconciler to be unaware of how to determine the type of
   745  // update necessary and can minimize the set of objects it is exposed to.
   746  func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType {
   747  	return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
   748  		// Same index, so nothing to do
   749  		if existing.Job.JobModifyIndex == newJob.JobModifyIndex {
   750  			return true, false, nil
   751  		}
   752  
   753  		// Check if the task drivers or config has changed, requires
   754  		// a destructive upgrade since that cannot be done in-place.
   755  		if tasksUpdated(newJob, existing.Job, newTG.Name) {
   756  			return false, true, nil
   757  		}
   758  
   759  		// Terminal batch allocations are not filtered when they are completed
   760  		// successfully. We should avoid adding the allocation to the plan in
   761  		// the case that it is an in-place update to avoid both additional data
   762  		// in the plan and work for the clients.
   763  		if existing.TerminalStatus() {
   764  			return true, false, nil
   765  		}
   766  
   767  		// Get the existing node
   768  		ws := memdb.NewWatchSet()
   769  		node, err := ctx.State().NodeByID(ws, existing.NodeID)
   770  		if err != nil {
   771  			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", evalID, existing.NodeID, err)
   772  			return true, false, nil
   773  		}
   774  		if node == nil {
   775  			return false, true, nil
   776  		}
   777  
   778  		// Set the existing node as the base set
   779  		stack.SetNodes([]*structs.Node{node})
   780  
   781  		// Stage an eviction of the current allocation. This is done so that
   782  		// the current allocation is discounted when checking for feasability.
   783  		// Otherwise we would be trying to fit the tasks current resources and
   784  		// updated resources. After select is called we can remove the evict.
   785  		ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "")
   786  
   787  		// Attempt to match the task group
   788  		option, _ := stack.Select(newTG)
   789  
   790  		// Pop the allocation
   791  		ctx.Plan().PopUpdate(existing)
   792  
   793  		// Require destructive if we could not do an in-place update
   794  		if option == nil {
   795  			return false, true, nil
   796  		}
   797  
   798  		// Restore the network offers from the existing allocation.
   799  		// We do not allow network resources (reserved/dynamic ports)
   800  		// to be updated. This is guarded in taskUpdated, so we can
   801  		// safely restore those here.
   802  		for task, resources := range option.TaskResources {
   803  			existingResources := existing.TaskResources[task]
   804  			resources.Networks = existingResources.Networks
   805  		}
   806  
   807  		// Create a shallow copy
   808  		newAlloc := new(structs.Allocation)
   809  		*newAlloc = *existing
   810  
   811  		// Update the allocation
   812  		newAlloc.EvalID = evalID
   813  		newAlloc.Job = nil       // Use the Job in the Plan
   814  		newAlloc.Resources = nil // Computed in Plan Apply
   815  		newAlloc.TaskResources = option.TaskResources
   816  		newAlloc.Metrics = ctx.Metrics()
   817  		return false, false, newAlloc
   818  	}
   819  }