github.com/ranjib/nomad@v0.1.1-0.20160225204057-97751b02f70b/scheduler/util.go

package scheduler

import (
	"fmt"
	"log"
	"math/rand"
	"reflect"

	"github.com/hashicorp/nomad/nomad/structs"
)

// allocTuple is a tuple of the allocation name and potential alloc ID
type allocTuple struct {
	Name      string
	TaskGroup *structs.TaskGroup
	Alloc     *structs.Allocation
}

// materializeTaskGroups is used to materialize all the task groups
// a job requires. This is used to do the count expansion.
func materializeTaskGroups(job *structs.Job) map[string]*structs.TaskGroup {
	out := make(map[string]*structs.TaskGroup)
	if job == nil {
		return out
	}

	for _, tg := range job.TaskGroups {
		for i := 0; i < tg.Count; i++ {
			name := fmt.Sprintf("%s.%s[%d]", job.Name, tg.Name, i)
			out[name] = tg
		}
	}
	return out
}
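
// Illustrative usage sketch, assuming a job named "web" with a single task
// group "frontend" whose Count is 3; the count expansion above yields one
// entry per desired instance:
//
//	tgs := materializeTaskGroups(job)
//	// len(tgs) == 3, with keys "web.frontend[0]", "web.frontend[1]" and
//	// "web.frontend[2]", each mapping to the same *structs.TaskGroup.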

// diffResult is used to return the sets that result from the diff
type diffResult struct {
	place, update, migrate, stop, ignore []allocTuple
}

func (d *diffResult) GoString() string {
	return fmt.Sprintf("allocs: (place %d) (update %d) (migrate %d) (stop %d) (ignore %d)",
		len(d.place), len(d.update), len(d.migrate), len(d.stop), len(d.ignore))
}

func (d *diffResult) Append(other *diffResult) {
	d.place = append(d.place, other.place...)
	d.update = append(d.update, other.update...)
	d.migrate = append(d.migrate, other.migrate...)
	d.stop = append(d.stop, other.stop...)
	d.ignore = append(d.ignore, other.ignore...)
}

// diffAllocs is used to do a set difference between the target allocations
// and the existing allocations. This returns 5 sets of results, the list of
// named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be evicted
// (no longer required), and those that should be ignored.
func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
	required map[string]*structs.TaskGroup, allocs []*structs.Allocation) *diffResult {
	result := &diffResult{}

	// Scan the existing allocations
	existing := make(map[string]struct{})
	for _, exist := range allocs {
		// Index the existing allocation by name
		name := exist.Name
		existing[name] = struct{}{}

		// Check for the definition in the required set
		tg, ok := required[name]

		// If not required, we stop the alloc
		if !ok {
			result.stop = append(result.stop, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If we are on a tainted node, we must migrate
		if taintedNodes[exist.NodeID] {
			result.migrate = append(result.migrate, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// If the definition is updated we need to update
		if job.JobModifyIndex != exist.Job.JobModifyIndex {
			result.update = append(result.update, allocTuple{
				Name:      name,
				TaskGroup: tg,
				Alloc:     exist,
			})
			continue
		}

		// Everything is up-to-date
		result.ignore = append(result.ignore, allocTuple{
			Name:      name,
			TaskGroup: tg,
			Alloc:     exist,
		})
	}

	// Scan the required groups
	for name, tg := range required {
		// Check for an existing allocation
		_, ok := existing[name]

		// Require a placement if no existing allocation. If there
		// is an existing allocation, we would have checked for a potential
		// update or ignore above.
		if !ok {
			result.place = append(result.place, allocTuple{
				Name:      name,
				TaskGroup: tg,
			})
		}
	}
	return result
}
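
// Illustrative usage sketch: a caller typically builds the required set with
// materializeTaskGroups and the tainted set with taintedNodes, then buckets
// the existing allocations:
//
//	required := materializeTaskGroups(job)
//	tainted, err := taintedNodes(state, allocs) // state: a State snapshot
//	if err != nil {
//		// handle the state error
//	}
//	diff := diffAllocs(job, tainted, required, allocs)
//	// diff.place   -> required names with no existing allocation
//	// diff.stop    -> allocations whose name is no longer required
//	// diff.migrate -> allocations sitting on tainted nodes
//	// diff.update  -> allocations built from a stale JobModifyIndex
//	// diff.ignore  -> allocations that are already up-to-date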

// diffSystemAllocs is like diffAllocs, except that the allocations in the
// diffResult contain the specific nodeID they should be allocated on.
func diffSystemAllocs(job *structs.Job, nodes []*structs.Node, taintedNodes map[string]bool,
	allocs []*structs.Allocation) *diffResult {

	// Build a mapping of nodes to all their allocs.
	nodeAllocs := make(map[string][]*structs.Allocation, len(allocs))
	for _, alloc := range allocs {
		nallocs := append(nodeAllocs[alloc.NodeID], alloc)
		nodeAllocs[alloc.NodeID] = nallocs
	}

	for _, node := range nodes {
		if _, ok := nodeAllocs[node.ID]; !ok {
			nodeAllocs[node.ID] = nil
		}
	}

	// Create the required task groups.
	required := materializeTaskGroups(job)

	result := &diffResult{}
	for nodeID, allocs := range nodeAllocs {
		diff := diffAllocs(job, taintedNodes, required, allocs)

		// Mark the alloc as being for a specific node.
		for i := range diff.place {
			alloc := &diff.place[i]
			alloc.Alloc = &structs.Allocation{NodeID: nodeID}
		}

		// Migrate does not apply to system jobs and instead should be marked as
		// stop because if a node is tainted, the job is invalid on that node.
		diff.stop = append(diff.stop, diff.migrate...)
		diff.migrate = nil

		result.Append(diff)
	}

	return result
}
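
// Illustrative sketch: because the diff is computed per node, every placement
// tuple already names its target node, so a caller can read it directly:
//
//	diff := diffSystemAllocs(job, readyNodes, tainted, allocs)
//	for _, tuple := range diff.place {
//		// tuple.Alloc is a placeholder allocation carrying only the
//		// NodeID the task group should be placed on.
//		targetNode := tuple.Alloc.NodeID
//		_ = targetNode
//	}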

// readyNodesInDCs returns all the ready nodes in the given datacenters and a
// mapping of each datacenter to the count of ready nodes.
func readyNodesInDCs(state State, dcs []string) ([]*structs.Node, map[string]int, error) {
	// Index the DCs
	dcMap := make(map[string]int, len(dcs))
	for _, dc := range dcs {
		dcMap[dc] = 0
	}

	// Scan the nodes
	var out []*structs.Node
	iter, err := state.Nodes()
	if err != nil {
		return nil, nil, err
	}
	for {
		raw := iter.Next()
		if raw == nil {
			break
		}

		// Filter on datacenter and status
		node := raw.(*structs.Node)
		if node.Status != structs.NodeStatusReady {
			continue
		}
		if node.Drain {
			continue
		}
		if _, ok := dcMap[node.Datacenter]; !ok {
			continue
		}
		out = append(out, node)
		dcMap[node.Datacenter] += 1
	}
	return out, dcMap, nil
}
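
// Illustrative usage sketch (the datacenter names are made up): callers filter
// to the job's datacenters and then randomize placement order:
//
//	nodes, byDC, err := readyNodesInDCs(state, []string{"dc1", "dc2"})
//	if err != nil {
//		// handle the state error
//	}
//	shuffleNodes(nodes)
//	// byDC["dc1"] is the count of ready, non-draining nodes seen in "dc1".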

// retryMax is used to retry a callback until it returns success or a maximum
// number of attempts is reached. An optional reset function may be passed; it
// is called after each failed iteration, and if it returns true the attempt
// counter is reset, allowing up to max further attempts.
func retryMax(max int, cb func() (bool, error), reset func() bool) error {
	attempts := 0
	for attempts < max {
		done, err := cb()
		if err != nil {
			return err
		}
		if done {
			return nil
		}

		// Check if we should reset the number of attempts
		if reset != nil && reset() {
			attempts = 0
		} else {
			attempts += 1
		}
	}
	return &SetStatusError{
		Err:        fmt.Errorf("maximum attempts reached (%d)", max),
		EvalStatus: structs.EvalStatusFailed,
	}
}
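
// Illustrative sketch with hypothetical helpers (submitOnce, planComplete):
// the callback does one unit of work, and the reset hook can keep the retry
// budget alive, for example while progressMade still reports progress:
//
//	var lastResult *structs.PlanResult
//	err := retryMax(5, func() (bool, error) {
//		result, err := submitOnce() // hypothetical: returns (*structs.PlanResult, error)
//		if err != nil {
//			return false, err
//		}
//		lastResult = result
//		return planComplete(result), nil // hypothetical: true when nothing is left to do
//	}, func() bool {
//		return progressMade(lastResult)
//	})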

// progressMade checks to see if the plan result made allocations or updates.
// If the result is nil, false is returned.
func progressMade(result *structs.PlanResult) bool {
	return result != nil && (len(result.NodeUpdate) != 0 ||
		len(result.NodeAllocation) != 0)
}

// taintedNodes is used to scan the allocations and then check if the
// underlying nodes are tainted, and should force a migration of the allocation.
func taintedNodes(state State, allocs []*structs.Allocation) (map[string]bool, error) {
	out := make(map[string]bool)
	for _, alloc := range allocs {
		if _, ok := out[alloc.NodeID]; ok {
			continue
		}

		node, err := state.NodeByID(alloc.NodeID)
		if err != nil {
			return nil, err
		}

		// If the node does not exist, we should migrate
		if node == nil {
			out[alloc.NodeID] = true
			continue
		}

		out[alloc.NodeID] = structs.ShouldDrainNode(node.Status) || node.Drain
	}
	return out, nil
}
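
// Illustrative sketch: the returned map is keyed by node ID, and a true value
// means allocations on that node must be moved:
//
//	tainted, err := taintedNodes(state, allocs)
//	if err != nil {
//		// handle the state error
//	}
//	if tainted[alloc.NodeID] {
//		// The node is missing, draining, or in a drain-worthy status;
//		// diffAllocs buckets such allocations into the migrate set.
//	}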

// shuffleNodes randomizes the slice order with the Fisher-Yates algorithm
func shuffleNodes(nodes []*structs.Node) {
	n := len(nodes)
	for i := n - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		nodes[i], nodes[j] = nodes[j], nodes[i]
	}
}
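
// Illustrative note: the shuffle is in place and draws from the package-level
// math/rand source, so a test wanting a reproducible order could seed it:
//
//	rand.Seed(1) // deterministic order for the sketch only
//	shuffleNodes(nodes)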

// tasksUpdated does a diff between task groups to see if the
// tasks, their drivers, environment variables or config have updated.
func tasksUpdated(a, b *structs.TaskGroup) bool {
	// If the number of tasks does not match, clearly there is an update
	if len(a.Tasks) != len(b.Tasks) {
		return true
	}

	// Check each task
	for _, at := range a.Tasks {
		bt := b.LookupTask(at.Name)
		if bt == nil {
			return true
		}
		if at.Driver != bt.Driver {
			return true
		}
		if !reflect.DeepEqual(at.Config, bt.Config) {
			return true
		}
		if !reflect.DeepEqual(at.Env, bt.Env) {
			return true
		}

		// Inspect the network to see if the dynamic ports are different
		if len(at.Resources.Networks) != len(bt.Resources.Networks) {
			return true
		}
		for idx := range at.Resources.Networks {
			an := at.Resources.Networks[idx]
			bn := bt.Resources.Networks[idx]
			if len(an.DynamicPorts) != len(bn.DynamicPorts) {
				return true
			}
		}
	}
	return false
}
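
// Illustrative sketch: inplaceUpdate below uses this check to decide whether
// an allocation can be updated in place at all:
//
//	existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name)
//	if tasksUpdated(update.TaskGroup, existing) {
//		// A driver, config, env, or network change was detected, so a
//		// destructive (stop then place) update is required instead.
//	}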

// setStatus is used to update the status of the evaluation
func setStatus(logger *log.Logger, planner Planner, eval, nextEval *structs.Evaluation, status, desc string) error {
	logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status)
	newEval := eval.Copy()
	newEval.Status = status
	newEval.StatusDescription = desc
	if nextEval != nil {
		newEval.NextEval = nextEval.ID
	}
	return planner.UpdateEval(newEval)
}

// inplaceUpdate attempts to update allocations in-place where possible.
func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
	stack Stack, updates []allocTuple) []allocTuple {

	n := len(updates)
	inplace := 0
	for i := 0; i < n; i++ {
		// Get the update
		update := updates[i]

		// Check if the task drivers or config have changed; if so, a rolling
		// upgrade is required since that cannot be done in-place.
		existing := update.Alloc.Job.LookupTaskGroup(update.TaskGroup.Name)
		if tasksUpdated(update.TaskGroup, existing) {
			continue
		}

		// Get the existing node
		node, err := ctx.State().NodeByID(update.Alloc.NodeID)
		if err != nil {
			ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v",
				eval, update.Alloc.NodeID, err)
			continue
		}
		if node == nil {
			continue
		}

		// Set the existing node as the base set
		stack.SetNodes([]*structs.Node{node})

		// Stage an eviction of the current allocation. This is done so that
		// the current allocation is discounted when checking for feasibility.
		// Otherwise we would be trying to fit the task's current resources and
		// updated resources. After Select is called we can remove the evict.
		ctx.Plan().AppendUpdate(update.Alloc, structs.AllocDesiredStatusStop,
			allocInPlace)

		// Attempt to match the task group
		option, size := stack.Select(update.TaskGroup)

		// Pop the allocation
		ctx.Plan().PopUpdate(update.Alloc)

		// Skip if we could not do an in-place update
		if option == nil {
			continue
		}

		// Restore the network offers from the existing allocation.
		// We do not allow network resources (reserved/dynamic ports)
		// to be updated. This is guarded in tasksUpdated, so we can
		// safely restore those here.
		for task, resources := range option.TaskResources {
			existing := update.Alloc.TaskResources[task]
			resources.Networks = existing.Networks
		}

		// Create a shallow copy
		newAlloc := new(structs.Allocation)
		*newAlloc = *update.Alloc

		// Update the allocation
		newAlloc.EvalID = eval.ID
		newAlloc.Job = nil // Use the Job in the Plan
		newAlloc.Resources = size
		newAlloc.TaskResources = option.TaskResources
		newAlloc.Metrics = ctx.Metrics()
		newAlloc.DesiredStatus = structs.AllocDesiredStatusRun
		newAlloc.ClientStatus = structs.AllocClientStatusPending
		newAlloc.PopulateServiceIDs(update.TaskGroup)
		ctx.Plan().AppendAlloc(newAlloc)

		// Remove this allocation from the slice
		updates[i] = updates[n-1]
		i--
		n--
		inplace++
	}
	if len(updates) > 0 {
		ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplace, len(updates))
	}
	return updates[:n]
}
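
// Illustrative sketch: the returned slice holds only the updates that could
// not be done in place (the in-place ones were swapped to the tail and
// truncated away), so the remainder needs a destructive pass:
//
//	destructive := inplaceUpdate(ctx, eval, job, stack, diff.update)
//	// destructive can then be handed to something like evictAndPlace below.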

// evictAndPlace is used to mark allocations for evicts and add them to the
// placement queue. evictAndPlace modifies both the diffResult and the
// limit. It returns true if the limit has been reached.
func evictAndPlace(ctx Context, diff *diffResult, allocs []allocTuple, desc string, limit *int) bool {
	n := len(allocs)
	for i := 0; i < n && i < *limit; i++ {
		a := allocs[i]
		ctx.Plan().AppendUpdate(a.Alloc, structs.AllocDesiredStatusStop, desc)
		diff.place = append(diff.place, a)
	}
	if n <= *limit {
		*limit -= n
		return false
	}
	*limit = 0
	return true
}
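
// Illustrative sketch: the limit pointer lets a caller spread destructive
// updates across evaluations, e.g. a rolling update with a made-up budget of
// two replacements per pass:
//
//	limit := 2
//	if evictAndPlace(ctx, diff, destructive, "rolling update", &limit) {
//		// The budget was exhausted; the untouched updates wait for a
//		// later evaluation.
//	}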

// tgConstrainTuple is used to store the total constraints of a task group.
type tgConstrainTuple struct {
	// Holds the combined constraints of the task group and all its sub-tasks.
	constraints []*structs.Constraint

	// The set of required drivers within the task group.
	drivers map[string]struct{}

	// The combined resources of all tasks within the task group.
	size *structs.Resources
}

// taskGroupConstraints collects the constraints, drivers and resources required by each
// sub-task to aggregate the TaskGroup totals
func taskGroupConstraints(tg *structs.TaskGroup) tgConstrainTuple {
	c := tgConstrainTuple{
		constraints: make([]*structs.Constraint, 0, len(tg.Constraints)),
		drivers:     make(map[string]struct{}),
		size:        new(structs.Resources),
	}

	c.constraints = append(c.constraints, tg.Constraints...)
	for _, task := range tg.Tasks {
		c.drivers[task.Driver] = struct{}{}
		c.constraints = append(c.constraints, task.Constraints...)
		c.size.Add(task.Resources)
	}

	return c
}
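
// Illustrative sketch: feasibility checks can consume the aggregate instead of
// walking every task:
//
//	tuple := taskGroupConstraints(tg)
//	// tuple.constraints: group constraints plus every task's constraints
//	// tuple.drivers:     set of driver names a node must support
//	// tuple.size:        summed resources for a quick capacity pre-check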

// initTaskState returns a map of task names to TaskState structs, each
// initialized to the given state.
func initTaskState(tg *structs.TaskGroup, state string) map[string]*structs.TaskState {
	states := make(map[string]*structs.TaskState, len(tg.Tasks))
	for _, task := range tg.Tasks {
		states[task.Name] = &structs.TaskState{State: state}
	}
	return states
}