github.com/dkerwin/nomad@v0.3.3-0.20160525181927-74554135514b/scheduler/util.go

package scheduler

import (
	"fmt"
	"log"
	"math/rand"
	"reflect"

	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name, its task group, and the
// potential existing allocation
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job == nil {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}
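
// exampleMaterialize is an illustrative sketch added by the editor, not part
// of the original scheduler code. It shows the count expansion performed by
// materializeTaskGroups: a job "example" with a task group "cache" whose
// Count is 3 yields the keys "example.cache[0]", "example.cache[1]" and
// "example.cache[2]". The function name and literal values are hypothetical.
func exampleMaterialize() {
	job := &structs.Job{
		Name: "example",
		TaskGroups: []*structs.TaskGroup{
			{Name: "cache", Count: 3},
		},
	}
	for name, tg := range materializeTaskGroups(job) {
		fmt.Printf("%s -> %s\n", name, tg.Name) // e.g. "example.cache[0] -> cache"
	}
}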

// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
}

// diffAllocs is used to do a set difference between the target allocations
// and the existing allocations. This returns 5 sets of results: the list of
// named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be evicted
// (no longer required), and those that should be ignored.
func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing allocations
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the existing allocation by name
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are on a tainted node, we must migrate if we are a service or
		// if the batch allocation did not finish
		if taintedNodes[exist.NodeID] {
			// If the job is batch and finished successfully, the fact that the
			// node is tainted does not mean it should be migrated, as the work
			// was already successfully finished. However, for service/system
			// jobs, tasks should never complete. The check on the batch type
			// defends against client bugs.
			if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
				goto IGNORE
			}
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
	IGNORE:
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if there is no existing allocation. If there
		// is an existing allocation, we would have checked for a potential
		// update or ignore above.
		if !ok {
			result.place = append(result.place, allocTuple{
				Name:      name,
				TaskGroup: tg,
			})
		}
	}
	return result
}
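
// exampleDiff is an illustrative sketch added by the editor, not part of the
// original scheduler code. It shows how the diff is typically driven: the
// required set comes from materializeTaskGroups, the tainted set comes from a
// helper such as taintedNodes below, and the result buckets the existing
// allocations into place/update/migrate/stop/ignore. The function name is
// hypothetical; the real schedulers wire this up against live state.
func exampleDiff(job *structs.Job, tainted map[string]bool, existing []*structs.Allocation) {
	required := materializeTaskGroups(job)
	diff := diffAllocs(job, tainted, required, existing)
	// GoString renders e.g. "allocs: (place 2) (update 0) (migrate 0) (stop 1) (ignore 3)"
	fmt.Printf("%#v\n", diff)
}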

// diffSystemAllocs is like diffAllocs, except that the allocations in the
// diffResult contain the specific nodeID they should be allocated on.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]bool,
	allocs []*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffAllocs(job, taintedNodes, required, allocs)

		// Mark the alloc as being for a specific node.
		for i := range diff.place {
			alloc := &diff.place[i]
			alloc.Alloc = &structs.Allocation{NodeID: nodeID}
		}

		// Migrate does not apply to system jobs and instead should be marked as
		// stop because if a node is tainted, the job is invalid on that node.
		diff.stop = append(diff.stop, diff.migrate...)
		diff.migrate = nil

		result.Append(diff)
	}

	return result
}
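
// exampleSystemDiff is an illustrative sketch added by the editor, not part of
// the original scheduler code. For system jobs the diff is computed per node,
// so each placement carries the node it is pinned to via its Alloc.NodeID.
// The function name is hypothetical.
func exampleSystemDiff(job *structs.Job, nodes []*structs.Node, tainted map[string]bool, existing []*structs.Allocation) {
	diff := diffSystemAllocs(job, nodes, tainted, existing)
	for _, p := range diff.place {
		fmt.Printf("place %s on node %s\n", p.Name, p.Alloc.NodeID)
	}
}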

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to its count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	var out []*structs.Node
	iter, err := state.Nodes()
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter] += 1
	}
	return out, dcMap, nil
}
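
// exampleReadyNodes is an illustrative sketch added by the editor, not part of
// the original scheduler code. It shows the typical call shape: the ready,
// non-draining nodes restricted to the job's datacenters plus a per-datacenter
// count. The function name is hypothetical, and the job.Datacenters field is
// assumed from the structs package (it is not shown in this file).
func exampleReadyNodes(state State, job *structs.Job) error {
	nodes, byDC, err := readyNodesInDCs(state, job.Datacenters)
	if err != nil {
		return err
	}
	fmt.Printf("%d ready nodes across %d datacenters\n", len(nodes), len(byDC))
	return nil
}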

// retryMax is used to retry a callback until it returns success or
// a maximum number of attempts is reached. An optional reset function may be
// passed which is called after each failed iteration. If the reset function is
// set and returns true, the attempt counter is reset, allowing up to max
// further attempts.
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}

		// Check if we should reset the number of attempts
		if reset != nil && reset() {
			attempts = 0
		} else {
			attempts += 1
		}
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}
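
// exampleRetry is an illustrative sketch added by the editor, not part of the
// original scheduler code. It retries a flaky operation up to three times;
// passing a nil reset function means the attempt counter is never reset. The
// helper and the tryOnce closure are hypothetical.
func exampleRetry() error {
	attempt := 0
	tryOnce := func() (bool, error) {
		attempt++
		return attempt >= 2, nil // succeed on the second attempt
	}
	return retryMax(3, tryOnce, nil)
}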

// progressMade checks to see if the plan result made allocations or updates.
// If the result is nil, false is returned.
func progressMade(result *structs.PlanResult) bool {
	return result != nil && (len(result.NodeUpdate) != 0 ||
		len(result.NodeAllocation) != 0)
}

// taintedNodes is used to scan the allocations and check whether the
// underlying nodes are tainted and should force a migration of the allocation.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]bool, error) {
	out := make(map[string]bool)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		node, err := state.NodeByID(alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = true
			continue
		}

		out[alloc.NodeID] = structs.ShouldDrainNode(node.Status) || node.Drain
	}
	return out, nil
}

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}

// tasksUpdated does a diff between task groups to see if the
// tasks, their drivers, environment variables or config have updated.
func tasksUpdated(a, b *structs.TaskGroup) bool {
	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if at.User != bt.User {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}
		if !reflect.DeepEqual(at.Meta, bt.Meta) {
			return true
		}
		if !reflect.DeepEqual(at.Artifacts, bt.Artifacts) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
			return true
		}
		for idx := range at.Resources.Networks {
			an := at.Resources.Networks[idx]
			bn := bt.Resources.Networks[idx]

			if an.MBits != bn.MBits {
				return true
			}

			aPorts, bPorts := networkPortMap(an), networkPortMap(bn)
			if !reflect.DeepEqual(aPorts, bPorts) {
				return true
			}
		}

		// Inspect the non-network resources
		if ar, br := at.Resources, bt.Resources; ar.CPU != br.CPU {
			return true
		} else if ar.MemoryMB != br.MemoryMB {
			return true
		} else if ar.DiskMB != br.DiskMB {
			return true
		} else if ar.IOPS != br.IOPS {
			return true
		}
	}
	return false
}
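
// exampleTasksUpdated is an illustrative sketch added by the editor, not part
// of the original scheduler code. Changing only a task's Config (here a
// hypothetical "image" key) is enough for tasksUpdated to report a change,
// which forces a destructive update rather than an in-place one. The names,
// driver config and resource figures are hypothetical.
func exampleTasksUpdated() bool {
	a := &structs.TaskGroup{
		Name: "web",
		Tasks: []*structs.Task{
			{
				Name:      "frontend",
				Driver:    "docker",
				Config:    map[string]interface{}{"image": "nginx:1.9"},
				Resources: &structs.Resources{CPU: 500, MemoryMB: 256},
			},
		},
	}
	b := &structs.TaskGroup{
		Name: "web",
		Tasks: []*structs.Task{
			{
				Name:      "frontend",
				Driver:    "docker",
				Config:    map[string]interface{}{"image": "nginx:1.10"},
				Resources: &structs.Resources{CPU: 500, MemoryMB: 256},
			},
		},
	}
	return tasksUpdated(a, b) // true: only the Config differs
}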

// networkPortMap takes a network resource and returns a map of port labels to
// values. The value for dynamic ports is disregarded even if it is set. This
// makes this function suitable for comparing two network resources for changes.
func networkPortMap(n *structs.NetworkResource) map[string]int {
	m := make(map[string]int, len(n.DynamicPorts)+len(n.ReservedPorts))
	for _, p := range n.ReservedPorts {
		m[p.Label] = p.Value
	}
	for _, p := range n.DynamicPorts {
		m[p.Label] = -1
	}
	return m
}
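
// examplePortMap is an illustrative sketch added by the editor, not part of
// the original scheduler code. Reserved ports keep their value while dynamic
// ports are normalized to -1, so two networks that differ only in the
// dynamically assigned port value compare as equal. The labels and port
// numbers are hypothetical, and structs.Port is assumed to be the element
// type of ReservedPorts/DynamicPorts.
func examplePortMap() map[string]int {
	n := &structs.NetworkResource{
		ReservedPorts: []structs.Port{{Label: "admin", Value: 8080}},
		DynamicPorts:  []structs.Port{{Label: "http", Value: 23456}},
	}
	return networkPortMap(n) // map[admin:8080 http:-1]
}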

// setStatus is used to update the status of the evaluation
func setStatus(logger *log.Logger, planner Planner, eval, nextEval, spawnedBlocked *structs.Evaluation, status, desc string) error {
	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	if spawnedBlocked != nil {
		newEval.BlockedEval = spawnedBlocked.ID
	}
	return planner.UpdateEval(newEval)
}

// inplaceUpdate attempts to update allocations in-place where possible. It
// returns the allocs that couldn't be updated in-place and then those that could.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) (destructive, inplace []allocTuple) {

	n := len(updates)
	inplaceCount := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; this requires
		// a rolling upgrade since it cannot be done in-place.
		existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name)
		if tasksUpdated(update.TaskGroup, existing) {
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
				eval, update.Alloc.NodeID, err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit both the task's current resources
		// and its updated resources. After Select is called we can remove the
		// eviction.
		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
			allocInPlace)

		// Attempt to match the task group
		option, _ := stack.Select(update.TaskGroup)

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			existing := update.Alloc.TaskResources[task]
			resources.Networks = existing.Networks
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = nil       // Use the Job in the Plan
		newAlloc.Resources = nil // Computed in Plan Apply
		newAlloc.TaskResources = option.TaskResources
		newAlloc.Metrics = ctx.Metrics()
		newAlloc.DesiredStatus = structs.AllocDesiredStatusRun
		newAlloc.ClientStatus = structs.AllocClientStatusPending
		newAlloc.PopulateServiceIDs(update.TaskGroup)
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice
		updates[i], updates[n-1] = updates[n-1], updates[i]
		i--
		n--
		inplaceCount++
	}
	if len(updates) > 0 {
		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates))
	}
	return updates[:n], updates[n:]
}

// evictAndPlace is used to mark allocations for eviction and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc)
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}
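
// exampleEvictAndPlace is an illustrative sketch added by the editor, not part
// of the original scheduler code. It shows the limit bookkeeping: with a limit
// of 1, at most one of the updates is stopped and re-queued for placement, and
// the call returns true whenever more updates remained than the limit allowed.
// The function name and description string are hypothetical.
func exampleEvictAndPlace(ctx Context, diff *diffResult) bool {
	limit := 1
	return evictAndPlace(ctx, diff, diff.update, "example update", &limit)
}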

// tgConstrainTuple is used to store the total constraints of a task group.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group.
	size *structs.Resources
}

// taskGroupConstraints collects the constraints, drivers and resources required by each
// sub-task to aggregate the TaskGroup totals
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
		size:        new(structs.Resources),
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
		c.size.Add(task.Resources)
	}

	return c
}
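
// exampleTGConstraints is an illustrative sketch added by the editor, not part
// of the original scheduler code. A group with two tasks contributes both of
// their drivers and the summed resources: here the aggregate size would be
// 750 MHz of CPU and 384 MB of memory. The task names, drivers and numbers
// are hypothetical.
func exampleTGConstraints() tgConstrainTuple {
	tg := &structs.TaskGroup{
		Name: "web",
		Tasks: []*structs.Task{
			{Name: "frontend", Driver: "docker", Resources: &structs.Resources{CPU: 500, MemoryMB: 256}},
			{Name: "logshipper", Driver: "exec", Resources: &structs.Resources{CPU: 250, MemoryMB: 128}},
		},
	}
	return taskGroupConstraints(tg) // drivers: docker, exec; size: CPU 750, MemoryMB 384
}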

// desiredUpdates takes the diffResult as well as the set of inplace and
// destructive updates and returns a map of task groups to their set of desired
// updates.
func desiredUpdates(diff *diffResult, inplaceUpdates,
	destructiveUpdates []allocTuple) map[string]*structs.DesiredUpdates {
	desiredTgs := make(map[string]*structs.DesiredUpdates)

	for _, tuple := range diff.place {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Place++
	}

	for _, tuple := range diff.stop {
		name := tuple.Alloc.TaskGroup
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Stop++
	}

	for _, tuple := range diff.ignore {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Ignore++
	}

	for _, tuple := range diff.migrate {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.Migrate++
	}

	for _, tuple := range inplaceUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.InPlaceUpdate++
	}

	for _, tuple := range destructiveUpdates {
		name := tuple.TaskGroup.Name
		des, ok := desiredTgs[name]
		if !ok {
			des = &structs.DesiredUpdates{}
			desiredTgs[name] = des
		}

		des.DestructiveUpdate++
	}

	return desiredTgs
}
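
// exampleDesired is an illustrative sketch added by the editor, not part of
// the original scheduler code. It prints the per-task-group rollup that the
// schedulers attach to the evaluation; the function name is hypothetical.
func exampleDesired(diff *diffResult, inplace, destructive []allocTuple) {
	for tg, d := range desiredUpdates(diff, inplace, destructive) {
		fmt.Printf("%s: place=%d stop=%d migrate=%d ignore=%d inplace=%d destructive=%d\n",
			tg, d.Place, d.Stop, d.Migrate, d.Ignore, d.InPlaceUpdate, d.DestructiveUpdate)
	}
}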