github.com/adityamillind98/nomad@v0.11.8/scheduler/util.go

package scheduler

import (
	"fmt"
	"math/rand"
	"reflect"

	log "github.com/hashicorp/go-hclog"
	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name, its task group, and the
// existing or potential allocation
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job.Stopped() {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}
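
// As an illustration of the count expansion above: a hypothetical job named
// "web" with a task group "frontend" whose Count is 2 yields the map keys
// "web.frontend[0]" and "web.frontend[1]", both pointing at the same
// *structs.TaskGroup.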

// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore, lost []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d) (lost %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore), len(d.lost))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
	d.lost = append(d.lost, other.lost...)
}

// diffSystemAllocsForNode is used to do a set difference between the target
// allocations and the existing allocations for a particular node. This
// returns six sets of results: the named task groups that need to be placed
// (no existing allocation), the allocations that need to be updated (the job
// definition is newer), the allocs that need to be migrated (the node is
// draining), the allocs that need to be evicted (no longer required), those
// that should be ignored, and those that are lost and need to be replaced
// (they were running on a lost node).
//
// job is the job whose allocs are being diffed.
// taintedNodes is an index, keyed by node ID, of the nodes that are either
// down or in drain mode.
// required is the set of allocation names that must exist, mapped to their
// task groups.
// allocs is a list of non-terminal allocations.
// terminalAllocs is an index of the latest terminal allocations by name.
func diffSystemAllocsForNode(job *structs.Job, nodeID string,
	eligibleNodes, taintedNodes map[string]*structs.Node,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation,
	terminalAllocs map[string]*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing allocations
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the existing allocation by name
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we have been marked for migration and aren't terminal, migrate
		if !exist.TerminalStatus() && exist.DesiredTransition.ShouldMigrate() {
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}
		// If we are on a tainted node, we must migrate if we are a service or
		// if the batch allocation did not finish
		if node, ok := taintedNodes[exist.NodeID]; ok {
			// If the job is batch and finished successfully, the fact that the
			// node is tainted does not mean it should be migrated or marked as
			// lost, as the work was already successfully finished. However, for
			// service/system jobs, tasks should never complete. The check of
			// batch type defends against client bugs.
			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
				goto IGNORE
			}

			if !exist.TerminalStatus() && (node == nil || node.TerminalStatus()) {
				result.lost = append(result.lost, allocTuple{
					Name:      name,
					TaskGroup: tg,
					Alloc:     exist,
				})
			} else {
				goto IGNORE
			}

			continue
		}

		// For an existing allocation, if the node is no longer
		// eligible, the diff should be ignored
		if _, ok := eligibleNodes[nodeID]; !ok {
			goto IGNORE
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
	IGNORE:
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if no existing allocation. If there
		// is an existing allocation, we would have checked for a potential
		// update or ignore above. Ignore placements for tainted or
		// ineligible nodes.
		if !ok {
			// Tainted and ineligible nodes for a non-existent alloc
			// should be filtered out and not count towards ignore or place
			if _, tainted := taintedNodes[nodeID]; tainted {
				continue
			}
			if _, eligible := eligibleNodes[nodeID]; !eligible {
				continue
			}

			allocTuple := allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     terminalAllocs[name],
			}

			// If the new allocation isn't annotated with a previous allocation
			// or if the previous allocation isn't from the same node, then we
			// annotate the allocTuple with a new Allocation
			if allocTuple.Alloc == nil || allocTuple.Alloc.NodeID != nodeID {
				allocTuple.Alloc = &structs.Allocation{NodeID: nodeID}
			}
			result.place = append(result.place, allocTuple)
		}
	}
	return result
}
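
// To summarize the decision order above: an existing alloc is stopped if it
// is no longer required; migrated if it is non-terminal and marked for
// migration; on a tainted node, a successfully finished batch alloc is
// ignored, a non-terminal alloc on a down or missing node is marked lost,
// and anything else is ignored; allocs on ineligible nodes are ignored;
// allocs whose job definition changed are updated; everything else is
// ignored. Required names with no existing alloc are placed, unless this
// node is tainted or ineligible.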

// diffSystemAllocs is like diffSystemAllocsForNode; however, the allocations
// in the diffResult contain the specific nodeID they should be allocated on.
//
// job is the job whose allocs are being diffed.
// nodes is a list of nodes in ready state.
// taintedNodes is an index, keyed by node ID, of the nodes that are either
// down or in drain mode.
// allocs is a list of non-terminal allocations.
// terminalAllocs is an index of the latest terminal allocations by name.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]*structs.Node,
	allocs []*structs.Allocation, terminalAllocs map[string]*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	eligibleNodes := make(map[string]*structs.Node)
	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
		eligibleNodes[node.ID] = node
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffSystemAllocsForNode(job, nodeID, eligibleNodes, taintedNodes, required, allocs, terminalAllocs)
		result.Append(diff)
	}

	return result
}

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to the count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	ws := memdb.NewWatchSet()
	var out []*structs.Node
	iter, err := state.Nodes(ws)
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if node.SchedulingEligibility != structs.NodeSchedulingEligible {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter]++
	}
	return out, dcMap, nil
}
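
// For example, calling readyNodesInDCs(state, []string{"dc1", "dc2"})
// against a state holding three ready, schedulable nodes in "dc1" and none
// in "dc2" would return those three nodes along with the map
// {"dc1": 3, "dc2": 0}.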

// retryMax is used to retry a callback until it returns success or
// a maximum number of attempts is reached. An optional reset function may be
// passed which is called after each failed iteration. If the reset function is
// set and returns true, the number of attempts is reset back to max.
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}

		// Check if we should reset the number of attempts
		if reset != nil && reset() {
			attempts = 0
		} else {
			attempts++
		}
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}
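
// A minimal usage sketch, with a hypothetical attemptPlan callback:
//
//	err := retryMax(5, func() (bool, error) {
//		done, err := attemptPlan() // retried until done or five failed attempts
//		return done, err
//	}, nil)
//
// Passing nil for reset means the attempt counter is never reset.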

// progressMade checks to see if the plan result made allocations or updates.
// If the result is nil, false is returned.
func progressMade(result *structs.PlanResult) bool {
	return result != nil && (len(result.NodeUpdate) != 0 ||
		len(result.NodeAllocation) != 0 || result.Deployment != nil ||
		len(result.DeploymentUpdates) != 0)
}

// taintedNodes scans the allocations and checks whether their underlying
// nodes are tainted and should force a migration of the allocation.
// All the nodes returned in the map are tainted.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*structs.Node, error) {
	out := make(map[string]*structs.Node)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		ws := memdb.NewWatchSet()
		node, err := state.NodeByID(ws, alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = nil
			continue
		}
		if structs.ShouldDrainNode(node.Status) || node.Drain {
			out[alloc.NodeID] = node
		}
	}
	return out, nil
}
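
// Note that a nil value in the map returned by taintedNodes is meaningful:
// it marks a node that no longer exists in state, which callers such as
// diffSystemAllocsForNode distinguish from a node that is merely draining.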

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}
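
// Note that shuffleNodes mutates the slice in place and draws from
// math/rand's shared global source, so its determinism depends on how that
// source was seeded.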

// tasksUpdated does a diff between task groups to see if the
// tasks, their drivers, environment variables or config have been updated.
// The inputs are the task group name to diff and the two jobs to diff.
// tasksUpdated and the functions it calls assume that the given
// task group has already been checked to be non-nil.
func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
	a := jobA.LookupTaskGroup(taskGroup)
	b := jobB.LookupTaskGroup(taskGroup)

	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check ephemeral disk
	if !reflect.DeepEqual(a.EphemeralDisk, b.EphemeralDisk) {
		return true
	}

	// Check that the network resources haven't changed
	if networkUpdated(a.Networks, b.Networks) {
		return true
	}

	// Check Affinities
	if affinitiesUpdated(jobA, jobB, taskGroup) {
		return true
	}

	// Check Spreads
	if spreadsUpdated(jobA, jobB, taskGroup) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if at.User != bt.User {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}
		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
			return true
		}
		if !reflect.DeepEqual(at.Vault, bt.Vault) {
			return true
		}
		if !reflect.DeepEqual(at.Templates, bt.Templates) {
			return true
		}

		// Check the metadata
		if !reflect.DeepEqual(
			jobA.CombinedTaskMeta(taskGroup, at.Name),
			jobB.CombinedTaskMeta(taskGroup, bt.Name)) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if networkUpdated(at.Resources.Networks, bt.Resources.Networks) {
			return true
		}

		// Inspect the non-network resources
		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
			return true
		} else if ar.MemoryMB != br.MemoryMB {
			return true
		} else if !ar.Devices.Equals(&br.Devices) {
			return true
		}
	}
	return false
}
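
// Note that the reflect.DeepEqual checks above are strict: a nil map or
// slice and an empty one compare as different, so, for example, a task
// whose Env changes from nil to an empty map is reported as updated.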

func networkUpdated(netA, netB []*structs.NetworkResource) bool {
	if len(netA) != len(netB) {
		return true
	}
	for idx := range netA {
		an := netA[idx]
		bn := netB[idx]

		if an.Mode != bn.Mode {
			return true
		}

		if an.MBits != bn.MBits {
			return true
		}

		aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
		if !reflect.DeepEqual(aPorts, bPorts) {
			return true
		}
	}
	return false
}

// networkPortMap takes a network resource and returns a map of port labels to
// values. The value for dynamic ports is disregarded even if it is set. This
// makes this function suitable for comparing two network resources for changes.
func networkPortMap(n *structs.NetworkResource) map[string]int {
	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
	for _, p := range n.ReservedPorts {
		m[p.Label] = p.Value
	}
	for _, p := range n.DynamicPorts {
		m[p.Label] = -1
	}
	return m
}
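
// For example, a network with a reserved port {Label: "admin", Value: 8080}
// and a dynamic port labeled "http" maps to {"admin": 8080, "http": -1}.
// The -1 sentinel lets two networks compare as equal regardless of which
// concrete port was dynamically assigned.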

func affinitiesUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
	var aAffinities []*structs.Affinity
	var bAffinities []*structs.Affinity

	tgA := jobA.LookupTaskGroup(taskGroup)
	tgB := jobB.LookupTaskGroup(taskGroup)

	// Append jobA job and task group level affinities
	aAffinities = append(aAffinities, jobA.Affinities...)
	aAffinities = append(aAffinities, tgA.Affinities...)

	// Append jobB job and task group level affinities
	bAffinities = append(bAffinities, jobB.Affinities...)
	bAffinities = append(bAffinities, tgB.Affinities...)

	// Append task affinities
	for _, task := range tgA.Tasks {
		aAffinities = append(aAffinities, task.Affinities...)
	}

	for _, task := range tgB.Tasks {
		bAffinities = append(bAffinities, task.Affinities...)
	}

	// Check for equality
	if len(aAffinities) != len(bAffinities) {
		return true
	}

	return !reflect.DeepEqual(aAffinities, bAffinities)
}

func spreadsUpdated(jobA, jobB *structs.Job, taskGroup string) bool {
	var aSpreads []*structs.Spread
	var bSpreads []*structs.Spread

	tgA := jobA.LookupTaskGroup(taskGroup)
	tgB := jobB.LookupTaskGroup(taskGroup)

	// Append jobA job and task group level spreads
	aSpreads = append(aSpreads, jobA.Spreads...)
	aSpreads = append(aSpreads, tgA.Spreads...)

	// Append jobB job and task group level spreads
	bSpreads = append(bSpreads, jobB.Spreads...)
	bSpreads = append(bSpreads, tgB.Spreads...)

	// Check for equality
	if len(aSpreads) != len(bSpreads) {
		return true
	}

	return !reflect.DeepEqual(aSpreads, bSpreads)
}

// setStatus is used to update the status of the evaluation
func setStatus(logger log.Logger, planner Planner,
	eval, nextEval, spawnedBlocked *structs.Evaluation,
	tgMetrics map[string]*structs.AllocMetric, status, desc string,
	queuedAllocs map[string]int, deploymentID string) error {

	logger.Debug("setting eval status", "status", status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	newEval.DeploymentID = deploymentID
	newEval.FailedTGAllocs = tgMetrics
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	if spawnedBlocked != nil {
		newEval.BlockedEval = spawnedBlocked.ID
	}
	if queuedAllocs != nil {
		newEval.QueuedAllocations = queuedAllocs
	}

	return planner.UpdateEval(newEval)
}

// inplaceUpdate attempts to update allocations in-place where possible. It
// returns the allocs that couldn't be updated in place, followed by those
// that could.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {

	// doInplace manipulates the updates slice to make the current allocation
	// an in-place update.
	doInplace := func(cur, last, inplaceCount *int) {
		updates[*cur], updates[*last-1] = updates[*last-1], updates[*cur]
		*cur--
		*last--
		*inplaceCount++
	}
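
	// The swap in doInplace partitions the updates slice as the loop below
	// runs: allocations that can be updated in-place are moved to the tail,
	// so when the loop finishes, updates[:n] holds the destructive updates
	// and updates[n:] holds the in-place ones (see the return below).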

	ws := memdb.NewWatchSet()
	n := len(updates)
	inplaceCount := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; that requires
		// a rolling upgrade since it cannot be done in-place.
		existing := update.Alloc.Job
		if tasksUpdated(job, existing, update.TaskGroup.Name) {
			continue
		}

		// Terminal batch allocations are not filtered when they are completed
		// successfully. We should avoid adding the allocation to the plan in
		// the case that it is an in-place update to avoid both additional data
		// in the plan and work for the clients.
		if update.Alloc.TerminalStatus() {
			doInplace(&i, &n, &inplaceCount)
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Error("failed to get node", "node_id", update.Alloc.NodeID, "error", err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit both the task's current and
		// updated resources. After Select is called we can remove the evict.
		ctx.Plan().AppendStoppedAlloc(update.Alloc, allocInPlace, "", "")

		// Attempt to match the task group
		option := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network and device offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			var networks structs.Networks
			var devices []*structs.AllocatedDeviceResource
			if update.Alloc.AllocatedResources != nil {
				if tr, ok := update.Alloc.AllocatedResources.Tasks[task]; ok {
					networks = tr.Networks
					devices = tr.Devices
				}
			} else if tr, ok := update.Alloc.TaskResources[task]; ok {
				networks = tr.Networks
			}

			// Add the networks and devices back
			resources.Networks = networks
			resources.Devices = devices
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = nil       // Use the Job in the Plan
		newAlloc.Resources = nil // Computed in Plan Apply
		newAlloc.AllocatedResources = &structs.AllocatedResources{
			Tasks:          option.TaskResources,
			TaskLifecycles: option.TaskLifecycles,
			Shared: structs.AllocatedSharedResources{
				DiskMB: int64(update.TaskGroup.EphemeralDisk.SizeMB),
			},
		}
		newAlloc.Metrics = ctx.Metrics()
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice
		doInplace(&i, &n, &inplaceCount)
	}

	if len(updates) > 0 {
		ctx.Logger().Debug("made in-place updates", "in-place", inplaceCount, "total_updates", len(updates))
	}
	return updates[:n], updates[n:]
}

// evictAndPlace is used to mark allocations for eviction and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendStoppedAlloc(a.Alloc, desc, "", "")
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}
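
// For example, with *limit == 2 and three allocs to replace, evictAndPlace
// stops and re-queues the first two, sets the limit to 0, and returns true
// to signal that the caller's update limit has been exhausted.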

// tgConstrainTuple is used to store the total constraints of a task group.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}
}

// taskGroupConstraints collects the constraints and drivers required by each
// sub-task to aggregate the TaskGroup totals
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
	}

	return c
}

// desiredUpdates takes the diffResult as well as the set of inplace and
// destructive updates and returns a map of task groups to their set of desired
// updates.
func desiredUpdates(diff *diffResult, inplaceUpdates,
	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
	desiredTgs := make(map[string]*structs.DesiredUpdates)

	for _, tuple := range diff.place {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Place++
	}

	for _, tuple := range diff.stop {
		name := tuple.Alloc.TaskGroup
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Stop++
	}

	for _, tuple := range diff.ignore {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Ignore++
	}

	for _, tuple := range diff.migrate {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Migrate++
	}

	for _, tuple := range inplaceUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.InPlaceUpdate++
	}

	for _, tuple := range destructiveUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.DestructiveUpdate++
	}

	return desiredTgs
}

// adjustQueuedAllocations decrements the number of allocations pending per task
// group based on the number of allocations successfully placed
func adjustQueuedAllocations(logger log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) {
	if result == nil {
		return
	}

	for _, allocations := range result.NodeAllocation {
		for _, allocation := range allocations {
			// Ensure that the allocation is newly created. We check that
			// the CreateIndex is equal to the ModifyIndex in order to check
			// that the allocation was just created. We do not check that
			// the CreateIndex is equal to the result's AllocIndex because
			// the allocations we get back have gone through the planner's
			// optimistic snapshot and thus their indexes may not be
			// correct, but they will be consistent.
			if allocation.CreateIndex != allocation.ModifyIndex {
				continue
			}

			if _, ok := queuedAllocs[allocation.TaskGroup]; ok {
				queuedAllocs[allocation.TaskGroup]--
			} else {
				logger.Error("allocation placed but task group is not in list of unplaced allocations", "task_group", allocation.TaskGroup)
			}
		}
	}
}

// updateNonTerminalAllocsToLost updates allocations that are in a pending or
// running state on a tainted node to lost, but only for allocs whose
// DesiredStatus is already stop or evict.
func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*structs.Node, allocs []*structs.Allocation) {
	for _, alloc := range allocs {
		node, ok := tainted[alloc.NodeID]
		if !ok {
			continue
		}

		// Only handle down nodes or nodes that are gone (node == nil)
		if node != nil && node.Status != structs.NodeStatusDown {
			continue
		}

		// If the scheduler has already marked the alloc as stop or evict but
		// the client hasn't reported a terminal status, mark the alloc lost.
		if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
			alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
				alloc.ClientStatus == structs.AllocClientStatusPending) {
			plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost, "")
		}
	}
}

// genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType
// function to be passed into the reconciler. The factory takes objects that
// exist only in the scheduler context and returns a function that can be used
// by the reconciler to make decisions about how to update an allocation. The
// factory allows the reconciler to be unaware of how to determine the type of
// update necessary and can minimize the set of objects it is exposed to.
func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType {
	return func(existing *structs.Allocation, newJob *structs.Job, newTG *structs.TaskGroup) (ignore, destructive bool, updated *structs.Allocation) {
		// Same index, so nothing to do
		if existing.Job.JobModifyIndex == newJob.JobModifyIndex {
			return true, false, nil
		}

		// Check if the task drivers or config have changed; that requires
		// a destructive upgrade since it cannot be done in-place.
		if tasksUpdated(newJob, existing.Job, newTG.Name) {
			return false, true, nil
		}

		// Terminal batch allocations are not filtered when they are completed
		// successfully. We should avoid adding the allocation to the plan in
		// the case that it is an in-place update to avoid both additional data
		// in the plan and work for the clients.
		if existing.TerminalStatus() {
			return true, false, nil
		}

		// Get the existing node
		ws := memdb.NewWatchSet()
		node, err := ctx.State().NodeByID(ws, existing.NodeID)
		if err != nil {
			ctx.Logger().Error("failed to get node", "node_id", existing.NodeID, "error", err)
			return true, false, nil
		}
		if node == nil {
			return false, true, nil
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit both the task's current and
		// updated resources. After Select is called we can remove the evict.
		ctx.Plan().AppendStoppedAlloc(existing, allocInPlace, "", "")

		// Attempt to match the task group
		option := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions

		// Pop the allocation
		ctx.Plan().PopUpdate(existing)

		// Require destructive if we could not do an in-place update
		if option == nil {
			return false, true, nil
		}

		// Restore the network and device offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			var networks structs.Networks
			var devices []*structs.AllocatedDeviceResource
			if existing.AllocatedResources != nil {
				if tr, ok := existing.AllocatedResources.Tasks[task]; ok {
					networks = tr.Networks
					devices = tr.Devices
				}
			} else if tr, ok := existing.TaskResources[task]; ok {
				networks = tr.Networks
			}

			// Add the networks and devices back
			resources.Networks = networks
			resources.Devices = devices
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *existing

		// Update the allocation
		newAlloc.EvalID = evalID
		newAlloc.Job = nil       // Use the Job in the Plan
		newAlloc.Resources = nil // Computed in Plan Apply
		newAlloc.AllocatedResources = &structs.AllocatedResources{
			Tasks:          option.TaskResources,
			TaskLifecycles: option.TaskLifecycles,
			Shared: structs.AllocatedSharedResources{
				DiskMB: int64(newTG.EphemeralDisk.SizeMB),
			},
		}

		// Since this is an in-place update, we should copy network
		// information from the original alloc. This is similar to how
		// we copy network info for task level networks above.
		//
		// existing.AllocatedResources is nil on Allocations created by
		// Nomad v0.8 or earlier.
		if existing.AllocatedResources != nil {
			newAlloc.AllocatedResources.Shared.Networks = existing.AllocatedResources.Shared.Networks
		}

		// Use metrics from the existing alloc for the in-place upgrade:
		// if the in-place upgrade succeeded, any scoring metadata from
		// when the alloc first went through the scheduler should be
		// preserved. Using scoring metadata from the context would
		// incorrectly replace it with metadata from only the single node
		// the allocation is already on.
		newAlloc.Metrics = existing.Metrics.Copy()
		return false, false, newAlloc
	}
}