github.com/adityamillind98/nomad@v0.11.8/scheduler/reconcile_util.go

     1  package scheduler
     2  
     3  import (
     4  	"fmt"
     5  	"sort"
     6  	"strings"
     7  
     8  	"time"
     9  
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
    13  // placementResult is an allocation that must be placed. It may have a
    14  // previous allocation attached that should be stopped only once the paired
    15  // placement is complete. This gives atomic place/stop behavior and prevents
    16  // an impossible resource ask during a rolling update from wiping out the
    17  // job.
    18  type placementResult interface {
    19  	// TaskGroup returns the task group the placement is for
    20  	TaskGroup() *structs.TaskGroup
    21  
    22  	// Name returns the name of the desired allocation
    23  	Name() string
    24  
    25  	// Canary returns whether the placement should be a canary
    26  	Canary() bool
    27  
    28  	// PreviousAllocation returns the previous allocation
    29  	PreviousAllocation() *structs.Allocation
    30  
    31  	// IsRescheduling returns whether the placement was rescheduling a failed allocation
    32  	IsRescheduling() bool
    33  
    34  	// StopPreviousAlloc returns whether the previous allocation should be
    35  	// stopped and if so the status description.
    36  	StopPreviousAlloc() (bool, string)
    37  }
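
// Illustrative sketch (not part of the original v0.11.8 source): one way a
// consumer of placementResult values could derive the stop actions that must
// accompany destructive placements. The examplePlanStops name is hypothetical.
func examplePlanStops(results []placementResult) []allocStopResult {
	var stops []allocStopResult
	for _, r := range results {
		// Only results that pair a stop with the placement (destructive
		// updates) report a previous allocation to stop.
		if stop, desc := r.StopPreviousAlloc(); stop {
			stops = append(stops, allocStopResult{
				alloc:             r.PreviousAllocation(),
				statusDescription: desc,
			})
		}
	}
	return stops
}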
    38  
    39  // allocStopResult contains the information required to stop a single allocation
    40  type allocStopResult struct {
    41  	alloc             *structs.Allocation
    42  	clientStatus      string
    43  	statusDescription string
    44  	followupEvalID    string
    45  }
    46  
    47  // allocPlaceResult contains the information required to place a single
    48  // allocation
    49  type allocPlaceResult struct {
    50  	name          string
    51  	canary        bool
    52  	taskGroup     *structs.TaskGroup
    53  	previousAlloc *structs.Allocation
    54  	reschedule    bool
    55  }
    56  
    57  func (a allocPlaceResult) TaskGroup() *structs.TaskGroup           { return a.taskGroup }
    58  func (a allocPlaceResult) Name() string                            { return a.name }
    59  func (a allocPlaceResult) Canary() bool                            { return a.canary }
    60  func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc }
    61  func (a allocPlaceResult) IsRescheduling() bool                    { return a.reschedule }
    62  func (a allocPlaceResult) StopPreviousAlloc() (bool, string)       { return false, "" }
    63  
    64  // allocDestructiveResult contains the information required to do a destructive
    65  // update. Destructive changes should be applied atomically, meaning the old
    66  // alloc is only stopped if the new one can be placed.
    67  type allocDestructiveResult struct {
    68  	placeName             string
    69  	placeTaskGroup        *structs.TaskGroup
    70  	stopAlloc             *structs.Allocation
    71  	stopStatusDescription string
    72  }
    73  
    74  func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup           { return a.placeTaskGroup }
    75  func (a allocDestructiveResult) Name() string                            { return a.placeName }
    76  func (a allocDestructiveResult) Canary() bool                            { return false }
    77  func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc }
    78  func (a allocDestructiveResult) IsRescheduling() bool                    { return false }
    79  func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) {
    80  	return true, a.stopStatusDescription
    81  }
    82  
    83  // allocMatrix is a mapping of task groups to their allocation set.
    84  type allocMatrix map[string]allocSet
    85  
    86  // newAllocMatrix takes a job and the existing allocations for the job and
    87  // creates an allocMatrix
    88  func newAllocMatrix(job *structs.Job, allocs []*structs.Allocation) allocMatrix {
    89  	m := allocMatrix(make(map[string]allocSet))
    90  	for _, a := range allocs {
    91  		s, ok := m[a.TaskGroup]
    92  		if !ok {
    93  			s = make(map[string]*structs.Allocation)
    94  			m[a.TaskGroup] = s
    95  		}
    96  		s[a.ID] = a
    97  	}
    98  
    99  	if job != nil {
   100  		for _, tg := range job.TaskGroups {
   101  			if _, ok := m[tg.Name]; !ok {
   102  				m[tg.Name] = make(map[string]*structs.Allocation)
   103  			}
   104  		}
   105  	}
   106  	return m
   107  }
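
// Illustrative sketch (not part of the original v0.11.8 source): building an
// allocMatrix for a job with two task groups when only one group has existing
// allocations. Groups without allocations still receive an empty set. The job
// and allocation values below are hypothetical.
func exampleAllocMatrix() allocMatrix {
	job := &structs.Job{
		TaskGroups: []*structs.TaskGroup{
			{Name: "web"},
			{Name: "db"},
		},
	}
	allocs := []*structs.Allocation{
		{ID: "alloc-1", TaskGroup: "web"},
		{ID: "alloc-2", TaskGroup: "web"},
	}
	// m["web"] has two entries; m["db"] exists but is empty.
	return newAllocMatrix(job, allocs)
}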
   108  
   109  // allocSet is a set of allocations with a series of helper functions defined
   110  // that help reconcile state.
   111  type allocSet map[string]*structs.Allocation
   112  
   113  // GoString provides a human-readable view of the set
   114  func (a allocSet) GoString() string {
   115  	if len(a) == 0 {
   116  		return "[]"
   117  	}
   118  
   119  	start := fmt.Sprintf("len(%d) [\n", len(a))
   120  	var s []string
   121  	for k, v := range a {
   122  		s = append(s, fmt.Sprintf("%q: %v", k, v.Name))
   123  	}
   124  	return start + strings.Join(s, "\n") + "]"
   125  }
   126  
   127  // nameSet returns the set of allocation names
   128  func (a allocSet) nameSet() map[string]struct{} {
   129  	names := make(map[string]struct{}, len(a))
   130  	for _, alloc := range a {
   131  		names[alloc.Name] = struct{}{}
   132  	}
   133  	return names
   134  }
   135  
   136  // nameOrder returns the allocations sorted by the index suffix of their names
   137  func (a allocSet) nameOrder() []*structs.Allocation {
   138  	allocs := make([]*structs.Allocation, 0, len(a))
   139  	for _, alloc := range a {
   140  		allocs = append(allocs, alloc)
   141  	}
   142  	sort.Slice(allocs, func(i, j int) bool {
   143  		return allocs[i].Index() < allocs[j].Index()
   144  	})
   145  	return allocs
   146  }
   147  
   148  // difference returns a new allocSet that has all the existing items except those
   149  // contained within the other allocation sets
   150  func (a allocSet) difference(others ...allocSet) allocSet {
   151  	diff := make(map[string]*structs.Allocation)
   152  OUTER:
   153  	for k, v := range a {
   154  		for _, other := range others {
   155  			if _, ok := other[k]; ok {
   156  				continue OUTER
   157  			}
   158  		}
   159  		diff[k] = v
   160  	}
   161  	return diff
   162  }
   163  
   164  // union returns a new allocSet that has the union of the two allocSets.
   165  // Conflicts prefer the last passed allocSet containing the value
   166  func (a allocSet) union(others ...allocSet) allocSet {
   167  	union := make(map[string]*structs.Allocation, len(a))
   168  	order := []allocSet{a}
   169  	order = append(order, others...)
   170  
   171  	for _, set := range order {
   172  		for k, v := range set {
   173  			union[k] = v
   174  		}
   175  	}
   176  
   177  	return union
   178  }
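
// Illustrative sketch (not part of the original v0.11.8 source): basic set
// algebra on allocSet values. difference drops entries present in any of the
// other sets, while union merges sets with later sets winning on conflicting
// IDs. The allocation IDs are hypothetical.
func exampleSetAlgebra() {
	a := allocSet{
		"alloc-1": {ID: "alloc-1"},
		"alloc-2": {ID: "alloc-2"},
	}
	b := allocSet{
		"alloc-2": {ID: "alloc-2"},
	}
	onlyA := a.difference(b) // contains "alloc-1" only
	both := a.union(b)       // contains "alloc-1" and "alloc-2"
	fmt.Printf("difference=%d union=%d\n", len(onlyA), len(both))
}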
   179  
   180  // fromKeys returns an alloc set matching the passed keys
   181  func (a allocSet) fromKeys(keys ...[]string) allocSet {
   182  	from := make(map[string]*structs.Allocation)
   183  	for _, set := range keys {
   184  		for _, k := range set {
   185  			if alloc, ok := a[k]; ok {
   186  				from[k] = alloc
   187  			}
   188  		}
   189  	}
   190  	return from
   191  }
   192  
   193  // filterByTainted takes a set of tainted nodes and filters the allocation set
   194  // into three groups:
   195  // 1. Those that exist on untainted nodes
   196  // 2. Those that exist on nodes that are draining
   197  // 3. Those that exist on lost nodes
   198  func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, migrate, lost allocSet) {
   199  	untainted = make(map[string]*structs.Allocation)
   200  	migrate = make(map[string]*structs.Allocation)
   201  	lost = make(map[string]*structs.Allocation)
   202  	for _, alloc := range a {
   203  		// Terminal allocs are always untainted as they should never be migrated
   204  		if alloc.TerminalStatus() {
   205  			untainted[alloc.ID] = alloc
   206  			continue
   207  		}
   208  
   209  		// Non-terminal allocs that should migrate should always migrate
   210  		if alloc.DesiredTransition.ShouldMigrate() {
   211  			migrate[alloc.ID] = alloc
   212  			continue
   213  		}
   214  
   215  		n, ok := nodes[alloc.NodeID]
   216  		if !ok {
   217  			// Node is untainted so alloc is untainted
   218  			untainted[alloc.ID] = alloc
   219  			continue
   220  		}
   221  
   222  		// Allocs on GC'd (nil) or lost nodes are Lost
   223  		if n == nil || n.TerminalStatus() {
   224  			lost[alloc.ID] = alloc
   225  			continue
   226  		}
   227  
   228  		// All other allocs are untainted
   229  		untainted[alloc.ID] = alloc
   230  	}
   231  	return
   232  }
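
// Illustrative sketch (not part of the original v0.11.8 source): splitting a
// set of running allocations by node health. A nil entry in the nodes map
// stands for a garbage-collected node, so its allocation is reported as lost,
// while allocations on nodes absent from the map stay untainted. Node and
// allocation names are hypothetical.
func exampleFilterByTainted() {
	allocs := allocSet{
		"alloc-1": {ID: "alloc-1", NodeID: "healthy-node", ClientStatus: structs.AllocClientStatusRunning},
		"alloc-2": {ID: "alloc-2", NodeID: "gc-node", ClientStatus: structs.AllocClientStatusRunning},
	}
	nodes := map[string]*structs.Node{
		"gc-node": nil, // node has been garbage collected
	}
	untainted, migrate, lost := allocs.filterByTainted(nodes)
	fmt.Printf("untainted=%d migrate=%d lost=%d\n", len(untainted), len(migrate), len(lost))
	// untainted=1 migrate=0 lost=1
}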
   233  
   234  // filterByRescheduleable filters the allocation set to return the set of allocations that are either
   235  // untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
   236  // at a future time are also returned so that we can create follow-up evaluations for them. Allocs are
   237  // skipped or considered untainted according to the logic defined in the shouldFilter method.
   238  func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID string, deployment *structs.Deployment) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
   239  	untainted = make(map[string]*structs.Allocation)
   240  	rescheduleNow = make(map[string]*structs.Allocation)
   241  
   242  	for _, alloc := range a {
   243  		var eligibleNow, eligibleLater bool
   244  		var rescheduleTime time.Time
   245  
   246  		// Ignore allocs that have already been rescheduled
   247  		if alloc.NextAllocation != "" {
   248  			continue
   249  		}
   250  
   251  		isUntainted, ignore := shouldFilter(alloc, isBatch)
   252  		if isUntainted {
   253  			untainted[alloc.ID] = alloc
   254  		}
   255  		if isUntainted || ignore {
   256  			continue
   257  		}
   258  
   259  		// Only failed allocs with desired status run reach this point.
   260  		// If the failed alloc is not eligible for rescheduling now, add it to the untainted set
   261  		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment)
   262  		if !eligibleNow {
   263  			untainted[alloc.ID] = alloc
   264  			if eligibleLater {
   265  				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
   266  			}
   267  		} else {
   268  			rescheduleNow[alloc.ID] = alloc
   269  		}
   270  	}
   271  	return
   272  }
   273  
   274  // shouldFilter returns whether the alloc should be ignored or considered untainted
   275  // Ignored allocs are filtered out.
   276  // Untainted allocs count against the desired total.
   277  // Filtering logic for batch jobs:
   278  // If complete, and ran successfully - untainted
   279  // If desired state is stop - ignore
   280  //
   281  // Filtering logic for service jobs:
   282  // If desired state is stop/evict - ignore
   283  // If client status is complete/lost - ignore
   284  func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
   285  	// Allocs from batch jobs should be filtered when the desired status
   286  	// is terminal and the client did not finish or when the client
   287  	// status is failed so that they will be replaced. If they are
   288  	// complete but not failed, they shouldn't be replaced.
   289  	if isBatch {
   290  		switch alloc.DesiredStatus {
   291  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
   292  			if alloc.RanSuccessfully() {
   293  				return true, false
   294  			}
   295  			return false, true
   296  		default:
   297  		}
   298  
   299  		switch alloc.ClientStatus {
   300  		case structs.AllocClientStatusFailed:
   301  		default:
   302  			return true, false
   303  		}
   304  		return false, false
   305  	}
   306  
   307  	// Handle service jobs
   308  	switch alloc.DesiredStatus {
   309  	case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
   310  		return false, true
   311  	default:
   312  	}
   313  
   314  	switch alloc.ClientStatus {
   315  	case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
   316  		return false, true
   317  	default:
   318  	}
   319  	return false, false
   320  }
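
// Illustrative sketch (not part of the original v0.11.8 source): how the
// classification plays out. A failed batch allocation is neither untainted nor
// ignored, so it remains a candidate for rescheduling, while a completed
// service allocation is ignored.
func exampleShouldFilter() {
	failedBatch := &structs.Allocation{
		DesiredStatus: structs.AllocDesiredStatusRun,
		ClientStatus:  structs.AllocClientStatusFailed,
	}
	completeService := &structs.Allocation{
		DesiredStatus: structs.AllocDesiredStatusRun,
		ClientStatus:  structs.AllocClientStatusComplete,
	}
	u1, ignore1 := shouldFilter(failedBatch, true)      // false, false
	u2, ignore2 := shouldFilter(completeService, false) // false, true
	fmt.Println(u1, ignore1, u2, ignore2)
}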
   321  
   322  // updateByReschedulable is a helper that encapsulates the logic for whether a failed allocation
   323  // should be rescheduled now, later, or left in the untainted set
   324  func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
   325  	// If the allocation is part of an ongoing active deployment, we only allow it to reschedule
   326  	// if it has been marked eligible
   327  	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
   328  		return
   329  	}
   330  
   331  	// Check if the allocation is marked for forced rescheduling
   332  	if alloc.DesiredTransition.ShouldForceReschedule() {
   333  		rescheduleNow = true
   334  	}
   335  
   336  	// Reschedule if the eval ID matches the alloc's followup evalID or if it's close to its reschedule time
   337  	rescheduleTime, eligible := alloc.NextRescheduleTime()
   338  	if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
   339  		rescheduleNow = true
   340  		return
   341  	}
   342  	if eligible && alloc.FollowupEvalID == "" {
   343  		rescheduleLater = true
   344  	}
   345  	return
   346  }
   347  
   348  // filterByTerminal filters out terminal allocs
   349  func filterByTerminal(untainted allocSet) (nonTerminal allocSet) {
   350  	nonTerminal = make(map[string]*structs.Allocation)
   351  	for id, alloc := range untainted {
   352  		if !alloc.TerminalStatus() {
   353  			nonTerminal[id] = alloc
   354  		}
   355  	}
   356  	return
   357  }
   358  
   359  // filterByDeployment filters allocations into two sets, those that match the
   360  // given deployment ID and those that don't
   361  func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) {
   362  	match = make(map[string]*structs.Allocation)
   363  	nonmatch = make(map[string]*structs.Allocation)
   364  	for _, alloc := range a {
   365  		if alloc.DeploymentID == id {
   366  			match[alloc.ID] = alloc
   367  		} else {
   368  			nonmatch[alloc.ID] = alloc
   369  		}
   370  	}
   371  	return
   372  }
   373  
   374  // delayByStopAfterClientDisconnect returns a delay for any lost allocation that
   375  // has stop_after_client_disconnect configured
   376  func (as allocSet) delayByStopAfterClientDisconnect() (later []*delayedRescheduleInfo) {
   377  	now := time.Now().UTC()
   378  	for _, a := range as {
   379  		if !a.ShouldClientStop() {
   380  			continue
   381  		}
   382  
   383  		t := a.WaitClientStop()
   384  
   385  		if t.After(now) {
   386  			later = append(later, &delayedRescheduleInfo{
   387  				allocID:        a.ID,
   388  				alloc:          a,
   389  				rescheduleTime: t,
   390  			})
   391  		}
   392  	}
   393  	return later
   394  }
   395  
   396  // allocNameIndex is used to select allocation names for placement or removal
   397  // given an existing set of placed allocations.
   398  type allocNameIndex struct {
   399  	job, taskGroup string
   400  	count          int
   401  	b              structs.Bitmap
   402  }
   403  
   404  // newAllocNameIndex returns an allocNameIndex for use in selecting names of
   405  // allocations to create or stop. It takes the job and task group name, desired
   406  // count and any existing allocations as input.
   407  func newAllocNameIndex(job, taskGroup string, count int, in allocSet) *allocNameIndex {
   408  	return &allocNameIndex{
   409  		count:     count,
   410  		b:         bitmapFrom(in, uint(count)),
   411  		job:       job,
   412  		taskGroup: taskGroup,
   413  	}
   414  }
   415  
   416  // bitmapFrom creates a bitmap from the given allocation set and an optional
   417  // minimum size. The bitmap is sized to the larger of the passed minimum and
   418  // the maximum alloc index of the input, rounded up to a byte boundary.
   419  func bitmapFrom(input allocSet, minSize uint) structs.Bitmap {
   420  	var max uint
   421  	for _, a := range input {
   422  		if num := a.Index(); num > max {
   423  			max = num
   424  		}
   425  	}
   426  
   427  	if l := uint(len(input)); minSize < l {
   428  		minSize = l
   429  	}
   430  
   431  	if max < minSize {
   432  		max = minSize
   433  	} else if max%8 == 0 {
   434  		// This can happen if the job was scaled down. If the max index is exactly
   435  		// byte-aligned, bump it so that the alignment below grows the bitmap and
   436  		// the index does not overflow it.
   437  		max++
   438  	}
   439  
   440  	if max == 0 {
   441  		max = 8
   442  	}
   443  
   444  	// byteAlign the count
   445  	// Byte-align the size
   446  		max = max + 8 - remainder
   447  	}
   448  
   449  	bitmap, err := structs.NewBitmap(max)
   450  	if err != nil {
   451  		panic(err)
   452  	}
   453  
   454  	for _, a := range input {
   455  		bitmap.Set(a.Index())
   456  	}
   457  
   458  	return bitmap
   459  }
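
// Illustrative sketch (not part of the original v0.11.8 source): the bitmap
// produced by bitmapFrom is always byte-aligned. Three allocations occupying
// name indexes 0-2 with a minimum size of 3 yield a bitmap of size 8 with only
// those three indexes set. The job and group names are hypothetical.
func exampleBitmapFrom() {
	set := allocSet{}
	for i := uint(0); i < 3; i++ {
		name := structs.AllocName("example-job", "web", i)
		set[name] = &structs.Allocation{ID: name, Name: name}
	}
	b := bitmapFrom(set, 3)
	fmt.Println(b.Size(), b.Check(2), b.Check(3)) // 8 true false
}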
   460  
   461  // Highest removes and returns the highest n used names. The returned set
   462  // can be less than n if there aren't n names set in the index
   463  func (a *allocNameIndex) Highest(n uint) map[string]struct{} {
   464  	h := make(map[string]struct{}, n)
   465  	for i := a.b.Size(); i > uint(0) && uint(len(h)) < n; i-- {
   466  		// Use this to avoid wrapping around b/c of the unsigned int
   467  		idx := i - 1
   468  		if a.b.Check(idx) {
   469  			a.b.Unset(idx)
   470  			h[structs.AllocName(a.job, a.taskGroup, idx)] = struct{}{}
   471  		}
   472  	}
   473  
   474  	return h
   475  }
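
// Illustrative sketch (not part of the original v0.11.8 source): Highest
// returns (and unsets) the names with the largest indexes, which is how the
// reconciler picks allocations to remove when scaling down. With indexes 0-2
// in use, asking for two names yields the names at indexes 2 and 1. Names are
// hypothetical.
func exampleHighest() map[string]struct{} {
	existing := allocSet{}
	for i := uint(0); i < 3; i++ {
		name := structs.AllocName("example-job", "web", i)
		existing[name] = &structs.Allocation{ID: name, Name: name}
	}
	idx := newAllocNameIndex("example-job", "web", 3, existing)
	// Returns example-job.web[2] and example-job.web[1].
	return idx.Highest(2)
}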
   476  
   477  // Set sets the indexes from the passed alloc set as used
   478  func (a *allocNameIndex) Set(set allocSet) {
   479  	for _, alloc := range set {
   480  		a.b.Set(alloc.Index())
   481  	}
   482  }
   483  
   484  // Unset unsets all indexes of the passed alloc set as being used
   485  func (a *allocNameIndex) Unset(as allocSet) {
   486  	for _, alloc := range as {
   487  		a.b.Unset(alloc.Index())
   488  	}
   489  }
   490  
   491  // UnsetIndex unsets the index as having its name used
   492  func (a *allocNameIndex) UnsetIndex(idx uint) {
   493  	a.b.Unset(idx)
   494  }
   495  
   496  // NextCanaries returns the next n names for use as canaries and sets them as
   497  // used. The existing canaries and destructive updates are also passed in.
   498  func (a *allocNameIndex) NextCanaries(n uint, existing, destructive allocSet) []string {
   499  	next := make([]string, 0, n)
   500  
   501  	// Create a name index
   502  	existingNames := existing.nameSet()
   503  
   504  	// First select indexes from the allocations that are undergoing destructive
   505  	// updates. This way we avoid duplicate names as they will get replaced.
   506  	dmap := bitmapFrom(destructive, uint(a.count))
   507  	remainder := n
   508  	for _, idx := range dmap.IndexesInRange(true, uint(0), uint(a.count)-1) {
   509  		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
   510  		if _, used := existingNames[name]; !used {
   511  			next = append(next, name)
   512  			a.b.Set(uint(idx))
   513  
   514  			// If we have enough, return
   515  			remainder = n - uint(len(next))
   516  			if remainder == 0 {
   517  				return next
   518  			}
   519  		}
   520  	}
   521  
   522  	// Get the set of unset names that can be used
   523  	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
   524  		name := structs.AllocName(a.job, a.taskGroup, uint(idx))
   525  		if _, used := existingNames[name]; !used {
   526  			next = append(next, name)
   527  			a.b.Set(uint(idx))
   528  
   529  			// If we have enough, return
   530  			remainder = n - uint(len(next))
   531  			if remainder == 0 {
   532  				return next
   533  			}
   534  		}
   535  	}
   536  
   537  	// We have exhausted the preferred and free set. Pick starting from the
   538  	// desired count up to count+remainder to avoid overlapping where possible.
   539  	// For example, if the desired count is 3 and we want 5 canaries, the first
   540  	// 3 canaries can use indexes [0, 1, 2] and the remaining two take indexes
   541  	// [3, 4] so that we do not overlap. Once the canaries are promoted, these
   542  	// would be the allocations that would be shut down as well.
   543  	for i := uint(a.count); i < uint(a.count)+remainder; i++ {
   544  		name := structs.AllocName(a.job, a.taskGroup, i)
   545  		next = append(next, name)
   546  	}
   547  
   548  	return next
   549  }
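
// Illustrative sketch (not part of the original v0.11.8 source): canary name
// selection when more canaries are requested than the desired count. With a
// count of 3 and no existing or destructive allocations, the first three
// canaries take indexes 0-2 and the remaining two spill over to indexes 3 and
// 4. The job and group names are hypothetical.
func exampleNextCanaries() []string {
	idx := newAllocNameIndex("example-job", "web", 3, nil)
	// Returns example-job.web[0] through example-job.web[4].
	return idx.NextCanaries(5, nil, nil)
}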
   550  
   551  // Next returns the next n names for use as new placements and sets them as
   552  // used.
   553  func (a *allocNameIndex) Next(n uint) []string {
   554  	next := make([]string, 0, n)
   555  
   556  	// Get the set of unset names that can be used
   557  	remainder := n
   558  	for _, idx := range a.b.IndexesInRange(false, uint(0), uint(a.count)-1) {
   559  		next = append(next, structs.AllocName(a.job, a.taskGroup, uint(idx)))
   560  		a.b.Set(uint(idx))
   561  
   562  		// If we have enough, return
   563  		remainder = n - uint(len(next))
   564  		if remainder == 0 {
   565  			return next
   566  		}
   567  	}
   568  
   569  	// We have exhausted the free set, now just pick overlapping indexes
   570  	var i uint
   571  	for i = 0; i < remainder; i++ {
   572  		next = append(next, structs.AllocName(a.job, a.taskGroup, i))
   573  		a.b.Set(i)
   574  	}
   575  
   576  	return next
   577  }
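
// Illustrative sketch (not part of the original v0.11.8 source): placement
// name selection when the free indexes run out. With a desired count of 2 and
// index 0 already taken, Next fills the free index 1 first and then reuses the
// overlapping index 0. Names are hypothetical.
func exampleNext() []string {
	existing := allocSet{
		"alloc-0": {ID: "alloc-0", Name: structs.AllocName("example-job", "web", 0)},
	}
	idx := newAllocNameIndex("example-job", "web", 2, existing)
	// Returns example-job.web[1] then example-job.web[0].
	return idx.Next(2)
}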