github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/scheduler/reconcile_util.go

package scheduler

import (
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// placementResult is an allocation that must be placed. It potentially has a
// previous allocation attached to it that should be stopped only if the
// paired placement is complete. This gives atomic place/stop behavior so that
// an impossible resource ask during a rolling update cannot wipe the job out.
type placementResult interface {
	// TaskGroup returns the task group the placement is for
	TaskGroup() *structs.TaskGroup

	// Name returns the name of the desired allocation
	Name() string

	// Canary returns whether the placement should be a canary
	Canary() bool

	// PreviousAllocation returns the previous allocation
	PreviousAllocation() *structs.Allocation

	// IsRescheduling returns whether the placement is rescheduling a failed allocation
	IsRescheduling() bool

	// StopPreviousAlloc returns whether the previous allocation should be
	// stopped and, if so, the status description.
	StopPreviousAlloc() (bool, string)
}

// allocStopResult contains the information required to stop a single allocation
type allocStopResult struct {
	alloc             *structs.Allocation
	clientStatus      string
	statusDescription string
}

// allocPlaceResult contains the information required to place a single
// allocation
type allocPlaceResult struct {
	name          string
	canary        bool
	taskGroup     *structs.TaskGroup
	previousAlloc *structs.Allocation
	reschedule    bool
}

func (a allocPlaceResult) TaskGroup() *structs.TaskGroup           { return a.taskGroup }
func (a allocPlaceResult) Name() string                            { return a.name }
func (a allocPlaceResult) Canary() bool                            { return a.canary }
func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc }
func (a allocPlaceResult) IsRescheduling() bool                    { return a.reschedule }
func (a allocPlaceResult) StopPreviousAlloc() (bool, string)       { return false, "" }

// allocDestructiveResult contains the information required to do a destructive
// update. Destructive changes should be applied atomically: the old alloc is
// stopped only if the new one can be placed.
type allocDestructiveResult struct {
	placeName             string
	placeTaskGroup        *structs.TaskGroup
	stopAlloc             *structs.Allocation
	stopStatusDescription string
}

func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup           { return a.placeTaskGroup }
func (a allocDestructiveResult) Name() string                            { return a.placeName }
func (a allocDestructiveResult) Canary() bool                            { return false }
func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc }
func (a allocDestructiveResult) IsRescheduling() bool                    { return false }
func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) {
	return true, a.stopStatusDescription
}

// allocMatrix is a mapping of task groups to their allocation set.
type allocMatrix map[string]allocSet

// newAllocMatrix takes a job and the existing allocations for the job and
// creates an allocMatrix
func newAllocMatrix(job *structs.Job, allocs []*structs.Allocation) allocMatrix {
	m := allocMatrix(make(map[string]allocSet))
	for _, a := range allocs {
		s, ok := m[a.TaskGroup]
		if !ok {
			s = make(map[string]*structs.Allocation)
			m[a.TaskGroup] = s
		}
		s[a.ID] = a
	}

	if job != nil {
		for _, tg := range job.TaskGroups {
			if _, ok := m[tg.Name]; !ok {
				m[tg.Name] = make(map[string]*structs.Allocation)
			}
		}
	}
	return m
}
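
// The sketch below is illustrative and not part of the original file: it
// shows how a job's allocations might be grouped per task group with
// newAllocMatrix, including task groups that currently have no allocations.
// The job and allocs arguments are assumed to come from the caller.
func sketchAllocMatrixUsage(job *structs.Job, allocs []*structs.Allocation) {
	m := newAllocMatrix(job, allocs)
	for tgName, set := range m {
		// Each entry maps a task group name to its allocSet (alloc ID -> alloc).
		fmt.Printf("task group %q has %d allocations\n", tgName, len(set))
	}
}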

// allocSet is a set of allocations with a series of helper functions defined
// that help reconcile state.
type allocSet map[string]*structs.Allocation

// GoString provides a human-readable view of the set
func (a allocSet) GoString() string {
	if len(a) == 0 {
		return "[]"
	}

	start := fmt.Sprintf("len(%d) [\n", len(a))
	var s []string
	for k, v := range a {
		s = append(s, fmt.Sprintf("%q: %v", k, v.Name))
	}
	return start + strings.Join(s, "\n") + "]"
}

// nameSet returns the set of allocation names
func (a allocSet) nameSet() map[string]struct{} {
	names := make(map[string]struct{}, len(a))
	for _, alloc := range a {
		names[alloc.Name] = struct{}{}
	}
	return names
}

// nameOrder returns the allocations sorted by their name index
func (a allocSet) nameOrder() []*structs.Allocation {
	allocs := make([]*structs.Allocation, 0, len(a))
	for _, alloc := range a {
		allocs = append(allocs, alloc)
	}
	sort.Slice(allocs, func(i, j int) bool {
		return allocs[i].Index() < allocs[j].Index()
	})
	return allocs
}

// difference returns a new allocSet that has all the existing items except
// those contained within the other allocation sets
func (a allocSet) difference(others ...allocSet) allocSet {
	diff := make(map[string]*structs.Allocation)
OUTER:
	for k, v := range a {
		for _, other := range others {
			if _, ok := other[k]; ok {
				continue OUTER
			}
		}
		diff[k] = v
	}
	return diff
}

// union returns a new allocSet that is the union of the receiver and the
// passed allocSets. Conflicts prefer the last passed allocSet containing the value
func (a allocSet) union(others ...allocSet) allocSet {
	union := make(map[string]*structs.Allocation, len(a))
	order := []allocSet{a}
	order = append(order, others...)

	for _, set := range order {
		for k, v := range set {
			union[k] = v
		}
	}

	return union
}
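
// The sketch below is illustrative and not part of the original file: it
// shows how difference and union compose. The "all", "stopping" and
// "replacements" sets are hypothetical inputs supplied by the caller.
func sketchSetOperations(all, stopping, replacements allocSet) allocSet {
	// Drop allocations that are being stopped, then merge in the replacements;
	// on ID conflicts the replacement wins because union prefers the last set.
	return all.difference(stopping).union(replacements)
}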

// fromKeys returns an alloc set matching the passed keys
func (a allocSet) fromKeys(keys ...[]string) allocSet {
	from := make(map[string]*structs.Allocation)
	for _, set := range keys {
		for _, k := range set {
			if alloc, ok := a[k]; ok {
				from[k] = alloc
			}
		}
	}
	return from
}

// filterByTainted takes a set of tainted nodes and filters the allocation set
// into three groups:
// 1. Those that exist on untainted nodes
// 2. Those that exist on nodes that are draining
// 3. Those that exist on lost nodes
func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, migrate, lost allocSet) {
	untainted = make(map[string]*structs.Allocation)
	migrate = make(map[string]*structs.Allocation)
	lost = make(map[string]*structs.Allocation)
	for _, alloc := range a {
		// Terminal allocs are always untainted as they should never be migrated
		if alloc.TerminalStatus() {
			untainted[alloc.ID] = alloc
			continue
		}

		// Non-terminal allocs that should migrate should always migrate
		if alloc.DesiredTransition.ShouldMigrate() {
			migrate[alloc.ID] = alloc
			continue
		}

		n, ok := nodes[alloc.NodeID]
		if !ok {
			// Node is untainted so alloc is untainted
			untainted[alloc.ID] = alloc
			continue
		}

		// Allocs on GC'd (nil) or lost nodes are lost
		if n == nil || n.TerminalStatus() {
			lost[alloc.ID] = alloc
			continue
		}

		// All other allocs are untainted
		untainted[alloc.ID] = alloc
	}
	return
}
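
// The sketch below is illustrative and not part of the original file: the
// taintedNodes map is assumed to be built by the caller, with a nil entry
// meaning the node has been garbage collected. Allocations are split into the
// three groups described in the comment on filterByTainted.
func sketchFilterByTainted(allocs allocSet, taintedNodes map[string]*structs.Node) {
	untainted, migrate, lost := allocs.filterByTainted(taintedNodes)
	fmt.Printf("untainted=%d migrate=%d lost=%d\n", len(untainted), len(migrate), len(lost))
}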

// filterByRescheduleable filters the allocation set to return the set of allocations that are either
// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
// at a future time are also returned so that we can create follow-up evaluations for them. Allocs are
// skipped or considered untainted according to logic defined in the shouldFilter method.
func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
	untainted = make(map[string]*structs.Allocation)
	rescheduleNow = make(map[string]*structs.Allocation)

	for _, alloc := range a {
		var eligibleNow, eligibleLater bool
		var rescheduleTime time.Time

		// Ignore allocs that have already been rescheduled
		if alloc.NextAllocation != "" {
			continue
		}

		isUntainted, ignore := shouldFilter(alloc, isBatch)
		if isUntainted {
			untainted[alloc.ID] = alloc
		}
		if isUntainted || ignore {
			continue
		}

		// Only failed allocs with desired status run get to this point.
		// If the failed alloc is not eligible for rescheduling now we add it to the untainted set.
		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment)
		if !eligibleNow {
			untainted[alloc.ID] = alloc
			if eligibleLater {
				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
			}
		} else {
			rescheduleNow[alloc.ID] = alloc
		}
	}
	return
}
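
// The sketch below is illustrative and not part of the original file: a
// service-job reconciler pass might split its allocations like this. The
// allocs, evalID and d values are hypothetical caller-supplied inputs.
func sketchRescheduleFiltering(allocs allocSet, evalID string, d *structs.Deployment) {
	untainted, now, later := allocs.filterByRescheduleable(false, time.Now(), evalID, d)
	fmt.Printf("untainted=%d rescheduleNow=%d rescheduleLater=%d\n", len(untainted), len(now), len(later))
}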

// shouldFilter returns whether the alloc should be ignored or considered untainted.
// Ignored allocs are filtered out.
// Untainted allocs count against the desired total.
//
// Filtering logic for batch jobs:
// If desired status is stop/evict: untainted if it ran successfully, otherwise ignore
// If client status is anything but failed: untainted
//
// Filtering logic for service jobs:
// If desired status is stop/evict: ignore
// If client status is complete/lost: ignore
func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
	// Allocs from batch jobs should be filtered when the desired status
	// is terminal and the client did not finish or when the client
	// status is failed so that they will be replaced. If they are
	// complete but not failed, they shouldn't be replaced.
	if isBatch {
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			if alloc.RanSuccessfully() {
				return true, false
			}
			return false, true
		default:
		}

		switch alloc.ClientStatus {
		case structs.AllocClientStatusFailed:
		default:
			return true, false
		}
		return false, false
	}

	// Handle service jobs
	switch alloc.DesiredStatus {
	case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
		return false, true
	default:
	}

	switch alloc.ClientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
		return false, true
	default:
	}
	return false, false
}
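
// The sketch below is illustrative and not part of the original file: for a
// service job, an allocation whose desired status is stop is ignored by the
// reschedule filtering regardless of its client status, as described above.
func sketchShouldFilter() {
	stoppedService := &structs.Allocation{
		DesiredStatus: structs.AllocDesiredStatusStop,
		ClientStatus:  structs.AllocClientStatusComplete,
	}
	untainted, ignore := shouldFilter(stoppedService, false)
	fmt.Printf("untainted=%v ignore=%v\n", untainted, ignore) // untainted=false ignore=true
}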

// updateByReschedulable is a helper method that encapsulates logic for whether a failed allocation
// should be rescheduled now, later or left in the untainted set
func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
	// If the allocation is part of an ongoing active deployment, we only allow it to reschedule
	// if it has been marked eligible
	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
		return
	}

	// Check if the allocation is marked for forced rescheduling
	if alloc.DesiredTransition.ShouldForceReschedule() {
		rescheduleNow = true
	}

	// Reschedule if the eval ID matches the alloc's followup evalID or if it is close to its reschedule time
	rescheduleTime, eligible := alloc.NextRescheduleTime()
	if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
		rescheduleNow = true
		return
	}
	if eligible && alloc.FollowupEvalID == "" {
		rescheduleLater = true
	}
	return
}

// filterByTerminal filters out terminal allocs
func filterByTerminal(untainted allocSet) (nonTerminal allocSet) {
	nonTerminal = make(map[string]*structs.Allocation)
	for id, alloc := range untainted {
		if !alloc.TerminalStatus() {
			nonTerminal[id] = alloc
		}
	}
	return
}

// filterByDeployment filters allocations into two sets, those that match the
// given deployment ID and those that don't
func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) {
	match = make(map[string]*structs.Allocation)
	nonmatch = make(map[string]*structs.Allocation)
	for _, alloc := range a {
		if alloc.DeploymentID == id {
			match[alloc.ID] = alloc
		} else {
			nonmatch[alloc.ID] = alloc
		}
	}
	return
}

// allocNameIndex is used to select allocation names for placement or removal
// given an existing set of placed allocations.
type allocNameIndex struct {
	job, taskGroup string
	count          int
	b              structs.Bitmap
}

// newAllocNameIndex returns an allocNameIndex for use in selecting names of
// allocations to create or stop. It takes the job and task group name, desired
// count and any existing allocations as input.
func newAllocNameIndex(job, taskGroup string, count int, in allocSet) *allocNameIndex {
	return &allocNameIndex{
		count:     count,
		b:         bitmapFrom(in, uint(count)),
		job:       job,
		taskGroup: taskGroup,
	}
}

// bitmapFrom creates a bitmap from the given allocation set and a minimum
// size. The size of the bitmap is the larger of the passed minimum and the
// maximum alloc index of the passed input, rounded up to a byte boundary.
func bitmapFrom(input allocSet, minSize uint) structs.Bitmap {
	var max uint
	for _, a := range input {
		if num := a.Index(); num > max {
			max = num
		}
	}

	if l := uint(len(input)); minSize < l {
		minSize = l
	}

	if max < minSize {
		max = minSize
	} else if max%8 == 0 {
		// This may be possible if the job was scaled down. We want to make sure
		// that the max index is not byte-aligned otherwise we will overflow
		// the bitmap.
		max++
	}

	if max == 0 {
		max = 8
	}

	// byteAlign the count
	if remainder := max % 8; remainder != 0 {
		max = max + 8 - remainder
	}

	bitmap, err := structs.NewBitmap(max)
	if err != nil {
		panic(err)
	}

	for _, a := range input {
		bitmap.Set(a.Index())
	}

	return bitmap
}
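
// The sketch below is illustrative and not part of the original file, walking
// through the sizing rules above: with no existing allocations and a desired
// count of 3, the bitmap is rounded up to 8 bits; if an allocation at index 8
// survives a scale-down to count 3, the max index is byte-aligned, so it is
// bumped to 9 and then rounded up to 16 bits, leaving room for Set(8).
func sketchBitmapSizing(scaledDown allocSet) {
	empty := bitmapFrom(nil, 3)
	fmt.Println(empty.Size()) // 8

	// Assuming scaledDown holds a single alloc named e.g. "web.cache[8]".
	survivor := bitmapFrom(scaledDown, 3)
	fmt.Println(survivor.Size()) // 16
}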

// Highest removes and returns the highest n used names. The returned set
// can be smaller than n if there aren't n names set in the index
func (a *allocNameIndex) Highest(n uint) map[string]struct{} {
	h := make(map[string]struct{}, n)
	for i := a.b.Size(); i > uint(0) && uint(len(h)) < n; i-- {
		// Use this to avoid wrapping around b/c of the unsigned int
		idx := i - 1
		if a.b.Check(idx) {
			a.b.Unset(idx)
			h[structs.AllocName(a.job, a.taskGroup, idx)] = struct{}{}
		}
	}

	return h
}

// Set marks the indexes of the passed alloc set as used
func (a *allocNameIndex) Set(set allocSet) {
	for _, alloc := range set {
		a.b.Set(alloc.Index())
	}
}

// Unset unsets all indexes of the passed alloc set as being used
func (a *allocNameIndex) Unset(as allocSet) {
	for _, alloc := range as {
		a.b.Unset(alloc.Index())
	}
}

// UnsetIndex unsets the index as having its name used
func (a *allocNameIndex) UnsetIndex(idx uint) {
	a.b.Unset(idx)
}

// NextCanaries returns the next n names for use as canaries and sets them as
// used. The existing canaries and destructive updates are also passed in.
func (a *allocNameIndex) NextCanaries(n uint, existing, destructive allocSet) []string {
	next := make([]string, 0, n)

	// Create a name index
	existingNames := existing.nameSet()

	// First select indexes from the allocations that are undergoing destructive
	// updates. This way we avoid duplicate names as they will get replaced.
	dmap := bitmapFrom(destructive, uint(a.count))
	remainder := n
	for _, idx := range dmap.IndexesInRange(true, uint(0), uint(a.count)-1) {
		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
		if _, used := existingNames[name]; !used {
			next = append(next, name)
			a.b.Set(uint(idx))

			// If we have enough, return
			remainder = n - uint(len(next))
			if remainder == 0 {
				return next
			}
		}
	}

	// Get the set of unset names that can be used
	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
		if _, used := existingNames[name]; !used {
			next = append(next, name)
			a.b.Set(uint(idx))

			// If we have enough, return
			remainder = n - uint(len(next))
			if remainder == 0 {
				return next
			}
		}
	}

	// We have exhausted the preferred and free set. Pick starting at the
	// desired count up to count+remainder, to avoid overlapping where possible.
	// For example, if the desired count is 3 and we want 5 canaries, the first
	// 3 canaries can use indexes [0, 1, 2], but after that we prefer picking
	// indexes [3, 4] so that we do not overlap. Once the canaries are promoted,
	// these would be the allocations that would be shut down as well.
	for i := uint(a.count); i < uint(a.count)+remainder; i++ {
		name := structs.AllocName(a.job, a.taskGroup, i)
		next = append(next, name)
	}

	return next
}
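
// The sketch below is illustrative and not part of the original file: with a
// hypothetical task group "cache" of job "web" whose desired count is 3, and
// no existing or destructive allocations, asking for 5 canary names yields
// web.cache[0..2] from the free indexes and web.cache[3..4] from the tail.
func sketchNextCanaries() []string {
	idx := newAllocNameIndex("web", "cache", 3, nil)
	return idx.NextCanaries(5, nil, nil)
}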

// Next returns the next n names for use as new placements and sets them as
// used.
func (a *allocNameIndex) Next(n uint) []string {
	next := make([]string, 0, n)

	// Get the set of unset names that can be used
	remainder := n
	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
		next = append(next, structs.AllocName(a.job, a.taskGroup, uint(idx)))
		a.b.Set(uint(idx))

		// If we have enough, return
		remainder = n - uint(len(next))
		if remainder == 0 {
			return next
		}
	}

	// We have exhausted the free set, now just pick overlapping indexes
	var i uint
	for i = 0; i < remainder; i++ {
		next = append(next, structs.AllocName(a.job, a.taskGroup, i))
		a.b.Set(i)
	}

	return next
}
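
// The sketch below is illustrative and not part of the original file: with a
// hypothetical name index for job "web", group "cache" and desired count 3,
// and no names in use, asking for two placements returns web.cache[0] and
// web.cache[1]; asking for more than the free indexes can supply wraps around
// and reuses low indexes, as described in the comment inside Next.
func sketchNext() []string {
	idx := newAllocNameIndex("web", "cache", 3, nil)
	return idx.Next(2) // ["web.cache[0]", "web.cache[1]"]
}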