github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/blocked_evals.go (about)

     1  package nomad
     2  
     3  import (
     4  	"sync"
     5  	"time"
     6  
     7  	"github.com/armon/go-metrics"
     8  	"github.com/hashicorp/consul/lib"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  )
    11  
const (
	// unblockBuffer is the buffer size of the capacity change channel that
	// the Unblock methods send on. The buffer should be large to ensure that
	// the FSM doesn't block when calling Unblock as this would apply
	// back-pressure on Raft.
	unblockBuffer = 8096

	// pruneInterval is the interval at which we prune objects from the
	// BlockedEvals tracker.
	pruneInterval = 5 * time.Minute

	// pruneThreshold is the age after which tracked unblock indexes become
	// eligible for pruning.
	pruneThreshold = 15 * time.Minute
)
    25  
// BlockedEvals is used to track evaluations that shouldn't be queued until a
// certain class of nodes becomes available. An evaluation is put into the
// blocked state when it is run through the scheduler and produced failed
// allocations. It is unblocked when the capacity of a node that could run the
// failed allocation becomes available.
type BlockedEvals struct {
	// evalBroker is where unblocked evaluations are enqueued, enabled gates
	// the tracker's operations, stats holds the counters reported by Stats,
	// and l is the lock the methods take before touching tracker state.
	evalBroker *EvalBroker
	enabled    bool
	stats      *BlockedStats
	l          sync.RWMutex

	// captured is the set of evaluations that are captured by computed node
	// classes, keyed by evaluation ID.
	captured map[string]wrappedEval

	// escaped is the set of evaluations that have escaped computed node
	// classes, keyed by evaluation ID.
	escaped map[string]wrappedEval

	// capacityChangeCh is used to buffer capacity and quota changes until
	// watchCapacity processes them and unblocks the matching evaluations.
	capacityChangeCh chan *capacityUpdate

	// jobs is the map of blocked job and is used to ensure that only one
	// blocked eval exists for each job. The key is the job ID and the value
	// is the blocked evaluation ID.
	jobs map[string]string

	// unblockIndexes maps computed node classes or quota name to the index in
	// which they were unblocked. This is used to check if an evaluation could
	// have been unblocked between the time they were in the scheduler and the
	// time they are being blocked.
	unblockIndexes map[string]uint64

	// duplicates is the set of evaluations for jobs that had pre-existing
	// blocked evaluations. These should be marked as cancelled since only one
	// blocked eval is needed per job.
	duplicates []*structs.Evaluation

	// duplicateCh is used to signal that a duplicate eval was added to the
	// duplicate set. It can be used to unblock waiting callers looking for
	// duplicates.
	duplicateCh chan struct{}

	// timetable is used to correlate indexes with their insertion time. This
	// allows us to prune based on time.
	timetable *TimeTable

	// stopCh is used to stop any created goroutines. It is closed when the
	// tracker is disabled and replaced with a fresh channel by Flush.
	stopCh chan struct{}
}
    75  
// capacityUpdate stores unblock data. It is the message passed from the
// Unblock methods to the watchCapacity goroutine.
type capacityUpdate struct {
	// computedClass is the computed node class that changed; it is left empty
	// by UnblockQuota.
	computedClass string
	// quotaChange is the name of the quota that changed; it is left empty by
	// Unblock.
	quotaChange string
	// index is the index at which the change happened.
	index uint64
}
    82  
// wrappedEval captures both the evaluation and the optional token. The token
// is empty for evaluations tracked through Block and is the caller supplied
// evaluation token for those tracked through Reblock.
type wrappedEval struct {
	eval  *structs.Evaluation
	token string
}
    88  
// BlockedStats holds all the stats about the blocked eval tracker.
type BlockedStats struct {
	// TotalEscaped is the total number of blocked evaluations that have escaped
	// computed node classes.
	TotalEscaped int

	// TotalBlocked is the total number of blocked evaluations. Escaped
	// evaluations are included in this count.
	TotalBlocked int

	// TotalQuotaLimit is the total number of blocked evaluations that are due
	// to the quota limit being reached.
	TotalQuotaLimit int
}
   102  
   103  // NewBlockedEvals creates a new blocked eval tracker that will enqueue
   104  // unblocked evals into the passed broker.
   105  func NewBlockedEvals(evalBroker *EvalBroker) *BlockedEvals {
   106  	return &BlockedEvals{
   107  		evalBroker:       evalBroker,
   108  		captured:         make(map[string]wrappedEval),
   109  		escaped:          make(map[string]wrappedEval),
   110  		jobs:             make(map[string]string),
   111  		unblockIndexes:   make(map[string]uint64),
   112  		capacityChangeCh: make(chan *capacityUpdate, unblockBuffer),
   113  		duplicateCh:      make(chan struct{}, 1),
   114  		stopCh:           make(chan struct{}),
   115  		stats:            new(BlockedStats),
   116  	}
   117  }
   118  
   119  // Enabled is used to check if the broker is enabled.
   120  func (b *BlockedEvals) Enabled() bool {
   121  	b.l.RLock()
   122  	defer b.l.RUnlock()
   123  	return b.enabled
   124  }
   125  
   126  // SetEnabled is used to control if the blocked eval tracker is enabled. The
   127  // tracker should only be enabled on the active leader.
   128  func (b *BlockedEvals) SetEnabled(enabled bool) {
   129  	b.l.Lock()
   130  	if b.enabled == enabled {
   131  		// No-op
   132  		b.l.Unlock()
   133  		return
   134  	} else if enabled {
   135  		go b.watchCapacity()
   136  		go b.prune()
   137  	} else {
   138  		close(b.stopCh)
   139  	}
   140  	b.enabled = enabled
   141  	b.l.Unlock()
   142  	if !enabled {
   143  		b.Flush()
   144  	}
   145  }
   146  
   147  func (b *BlockedEvals) SetTimetable(timetable *TimeTable) {
   148  	b.l.Lock()
   149  	b.timetable = timetable
   150  	b.l.Unlock()
   151  }
   152  
// Block tracks the passed evaluation and enqueues it into the eval broker when
// a suitable node calls unblock. It is equivalent to Reblock with an empty
// token.
func (b *BlockedEvals) Block(eval *structs.Evaluation) {
	b.processBlock(eval, "")
}
   158  
// Reblock tracks the passed evaluation and enqueues it into the eval broker when
// a suitable node calls unblock. Reblock should be used over Block when the
// blocking is occurring by an outstanding evaluation. The token is the
// evaluation's token; it is stored with the evaluation and handed back to the
// eval broker when the evaluation is enqueued again.
func (b *BlockedEvals) Reblock(eval *structs.Evaluation, token string) {
	b.processBlock(eval, token)
}
   166  
   167  // processBlock is the implementation of blocking an evaluation. It supports
   168  // taking an optional evaluation token to use when reblocking an evaluation that
   169  // may be outstanding.
   170  func (b *BlockedEvals) processBlock(eval *structs.Evaluation, token string) {
   171  	b.l.Lock()
   172  	defer b.l.Unlock()
   173  
   174  	// Do nothing if not enabled
   175  	if !b.enabled {
   176  		return
   177  	}
   178  
   179  	// Check if the job already has a blocked evaluation. If it does add it to
   180  	// the list of duplicates. We only ever want one blocked evaluation per job,
   181  	// otherwise we would create unnecessary work for the scheduler as multiple
   182  	// evals for the same job would be run, all producing the same outcome.
   183  	if _, existing := b.jobs[eval.JobID]; existing {
   184  		b.duplicates = append(b.duplicates, eval)
   185  
   186  		// Unblock any waiter.
   187  		select {
   188  		case b.duplicateCh <- struct{}{}:
   189  		default:
   190  		}
   191  
   192  		return
   193  	}
   194  
   195  	// Check if the eval missed an unblock while it was in the scheduler at an
   196  	// older index. The scheduler could have been invoked with a snapshot of
   197  	// state that was prior to additional capacity being added or allocations
   198  	// becoming terminal.
   199  	if b.missedUnblock(eval) {
   200  		// Just re-enqueue the eval immediately. We pass the token so that the
   201  		// eval_broker can properly handle the case in which the evaluation is
   202  		// still outstanding.
   203  		b.evalBroker.EnqueueAll(map[*structs.Evaluation]string{eval: token})
   204  		return
   205  	}
   206  
   207  	// Mark the job as tracked.
   208  	b.jobs[eval.JobID] = eval.ID
   209  	b.stats.TotalBlocked++
   210  
   211  	// Track that the evaluation is being added due to reaching the quota limit
   212  	if eval.QuotaLimitReached != "" {
   213  		b.stats.TotalQuotaLimit++
   214  	}
   215  
   216  	// Wrap the evaluation, capturing its token.
   217  	wrapped := wrappedEval{
   218  		eval:  eval,
   219  		token: token,
   220  	}
   221  
   222  	// If the eval has escaped, meaning computed node classes could not capture
   223  	// the constraints of the job, we store the eval separately as we have to
   224  	// unblock it whenever node capacity changes. This is because we don't know
   225  	// what node class is feasible for the jobs constraints.
   226  	if eval.EscapedComputedClass {
   227  		b.escaped[eval.ID] = wrapped
   228  		b.stats.TotalEscaped++
   229  		return
   230  	}
   231  
   232  	// Add the eval to the set of blocked evals whose jobs constraints are
   233  	// captured by computed node class.
   234  	b.captured[eval.ID] = wrapped
   235  }
   236  
   237  // missedUnblock returns whether an evaluation missed an unblock while it was in
   238  // the scheduler. Since the scheduler can operate at an index in the past, the
   239  // evaluation may have been processed missing data that would allow it to
   240  // complete. This method returns if that is the case and should be called with
   241  // the lock held.
   242  func (b *BlockedEvals) missedUnblock(eval *structs.Evaluation) bool {
   243  	var max uint64 = 0
   244  	for id, index := range b.unblockIndexes {
   245  		// Calculate the max unblock index
   246  		if max < index {
   247  			max = index
   248  		}
   249  
   250  		// The evaluation is blocked because it has hit a quota limit not class
   251  		// eligibility
   252  		if eval.QuotaLimitReached != "" {
   253  			if eval.QuotaLimitReached != id {
   254  				// Not a match
   255  				continue
   256  			} else if eval.SnapshotIndex < index {
   257  				// The evaluation was processed before the quota specification was
   258  				// updated, so unblock the evaluation.
   259  				return true
   260  			}
   261  
   262  			// The evaluation was processed having seen all changes to the quota
   263  			return false
   264  		}
   265  
   266  		elig, ok := eval.ClassEligibility[id]
   267  		if !ok && eval.SnapshotIndex < index {
   268  			// The evaluation was processed and did not encounter this class
   269  			// because it was added after it was processed. Thus for correctness
   270  			// we need to unblock it.
   271  			return true
   272  		}
   273  
   274  		// The evaluation could use the computed node class and the eval was
   275  		// processed before the last unblock.
   276  		if elig && eval.SnapshotIndex < index {
   277  			return true
   278  		}
   279  	}
   280  
   281  	// If the evaluation has escaped, and the map contains an index older than
   282  	// the evaluations, it should be unblocked.
   283  	if eval.EscapedComputedClass && eval.SnapshotIndex < max {
   284  		return true
   285  	}
   286  
   287  	// The evaluation is ahead of all recent unblocks.
   288  	return false
   289  }
   290  
   291  // Untrack causes any blocked evaluation for the passed job to be no longer
   292  // tracked. Untrack is called when there is a successful evaluation for the job
   293  // and a blocked evaluation is no longer needed.
   294  func (b *BlockedEvals) Untrack(jobID string) {
   295  	b.l.Lock()
   296  	defer b.l.Unlock()
   297  
   298  	// Do nothing if not enabled
   299  	if !b.enabled {
   300  		return
   301  	}
   302  
   303  	// Get the evaluation ID to cancel
   304  	evalID, ok := b.jobs[jobID]
   305  	if !ok {
   306  		// No blocked evaluation so exit
   307  		return
   308  	}
   309  
   310  	// Attempt to delete the evaluation
   311  	if w, ok := b.captured[evalID]; ok {
   312  		delete(b.jobs, w.eval.JobID)
   313  		delete(b.captured, evalID)
   314  		b.stats.TotalBlocked--
   315  		if w.eval.QuotaLimitReached != "" {
   316  			b.stats.TotalQuotaLimit--
   317  		}
   318  	}
   319  
   320  	if w, ok := b.escaped[evalID]; ok {
   321  		delete(b.jobs, w.eval.JobID)
   322  		delete(b.escaped, evalID)
   323  		b.stats.TotalEscaped--
   324  		b.stats.TotalBlocked--
   325  		if w.eval.QuotaLimitReached != "" {
   326  			b.stats.TotalQuotaLimit--
   327  		}
   328  	}
   329  }
   330  
   331  // Unblock causes any evaluation that could potentially make progress on a
   332  // capacity change on the passed computed node class to be enqueued into the
   333  // eval broker.
   334  func (b *BlockedEvals) Unblock(computedClass string, index uint64) {
   335  	b.l.Lock()
   336  
   337  	// Do nothing if not enabled
   338  	if !b.enabled {
   339  		b.l.Unlock()
   340  		return
   341  	}
   342  
   343  	// Store the index in which the unblock happened. We use this on subsequent
   344  	// block calls in case the evaluation was in the scheduler when a trigger
   345  	// occurred.
   346  	b.unblockIndexes[computedClass] = index
   347  	b.l.Unlock()
   348  
   349  	b.capacityChangeCh <- &capacityUpdate{
   350  		computedClass: computedClass,
   351  		index:         index,
   352  	}
   353  }
   354  
   355  // UnblockQuota causes any evaluation that could potentially make progress on a
   356  // capacity change on the passed quota to be enqueued into the eval broker.
   357  func (b *BlockedEvals) UnblockQuota(quota string, index uint64) {
   358  	// Nothing to do
   359  	if quota == "" {
   360  		return
   361  	}
   362  
   363  	b.l.Lock()
   364  
   365  	// Do nothing if not enabled
   366  	if !b.enabled {
   367  		b.l.Unlock()
   368  		return
   369  	}
   370  
   371  	// Store the index in which the unblock happened. We use this on subsequent
   372  	// block calls in case the evaluation was in the scheduler when a trigger
   373  	// occurred.
   374  	b.unblockIndexes[quota] = index
   375  	b.l.Unlock()
   376  
   377  	b.capacityChangeCh <- &capacityUpdate{
   378  		quotaChange: quota,
   379  		index:       index,
   380  	}
   381  }
   382  
   383  // UnblockClassAndQuota causes any evaluation that could potentially make
   384  // progress on a capacity change on the passed computed node class or quota to
   385  // be enqueued into the eval broker.
   386  func (b *BlockedEvals) UnblockClassAndQuota(class, quota string, index uint64) {
   387  	b.l.Lock()
   388  
   389  	// Do nothing if not enabled
   390  	if !b.enabled {
   391  		b.l.Unlock()
   392  		return
   393  	}
   394  
   395  	// Store the index in which the unblock happened. We use this on subsequent
   396  	// block calls in case the evaluation was in the scheduler when a trigger
   397  	// occurred.
   398  	if quota != "" {
   399  		b.unblockIndexes[quota] = index
   400  	}
   401  	b.unblockIndexes[class] = index
   402  	b.l.Unlock()
   403  
   404  	b.capacityChangeCh <- &capacityUpdate{
   405  		computedClass: class,
   406  		quotaChange:   quota,
   407  		index:         index,
   408  	}
   409  }
   410  
   411  // watchCapacity is a long lived function that watches for capacity changes in
   412  // nodes and unblocks the correct set of evals.
   413  func (b *BlockedEvals) watchCapacity() {
   414  	for {
   415  		select {
   416  		case <-b.stopCh:
   417  			return
   418  		case update := <-b.capacityChangeCh:
   419  			b.unblock(update.computedClass, update.quotaChange, update.index)
   420  		}
   421  	}
   422  }
   423  
   424  func (b *BlockedEvals) unblock(computedClass, quota string, index uint64) {
   425  	b.l.Lock()
   426  	defer b.l.Unlock()
   427  
   428  	// Protect against the case of a flush.
   429  	if !b.enabled {
   430  		return
   431  	}
   432  
   433  	// Every eval that has escaped computed node class has to be unblocked
   434  	// because any node could potentially be feasible.
   435  	numEscaped := len(b.escaped)
   436  	numQuotaLimit := 0
   437  	unblocked := make(map[*structs.Evaluation]string, lib.MaxInt(numEscaped, 4))
   438  
   439  	if numEscaped != 0 && computedClass != "" {
   440  		for id, wrapped := range b.escaped {
   441  			unblocked[wrapped.eval] = wrapped.token
   442  			delete(b.escaped, id)
   443  			delete(b.jobs, wrapped.eval.JobID)
   444  
   445  			if wrapped.eval.QuotaLimitReached != "" {
   446  				numQuotaLimit++
   447  			}
   448  		}
   449  	}
   450  
   451  	// We unblock any eval that is explicitly eligible for the computed class
   452  	// and also any eval that is not eligible or uneligible. This signifies that
   453  	// when the evaluation was originally run through the scheduler, that it
   454  	// never saw a node with the given computed class and thus needs to be
   455  	// unblocked for correctness.
   456  	for id, wrapped := range b.captured {
   457  		if quota != "" && wrapped.eval.QuotaLimitReached != quota {
   458  			// We are unblocking based on quota and this eval doesn't match
   459  			continue
   460  		} else if elig, ok := wrapped.eval.ClassEligibility[computedClass]; ok && !elig {
   461  			// Can skip because the eval has explicitly marked the node class
   462  			// as ineligible.
   463  			continue
   464  		}
   465  
   466  		// Unblock the evaluation because it is either for the matching quota,
   467  		// is eligible based on the computed node class, or never seen the
   468  		// computed node class.
   469  		unblocked[wrapped.eval] = wrapped.token
   470  		delete(b.jobs, wrapped.eval.JobID)
   471  		delete(b.captured, id)
   472  		if wrapped.eval.QuotaLimitReached != "" {
   473  			numQuotaLimit++
   474  		}
   475  	}
   476  
   477  	if l := len(unblocked); l != 0 {
   478  		// Update the counters
   479  		b.stats.TotalEscaped = 0
   480  		b.stats.TotalBlocked -= l
   481  		b.stats.TotalQuotaLimit -= numQuotaLimit
   482  
   483  		// Enqueue all the unblocked evals into the broker.
   484  		b.evalBroker.EnqueueAll(unblocked)
   485  	}
   486  }
   487  
   488  // UnblockFailed unblocks all blocked evaluation that were due to scheduler
   489  // failure.
   490  func (b *BlockedEvals) UnblockFailed() {
   491  	b.l.Lock()
   492  	defer b.l.Unlock()
   493  
   494  	// Do nothing if not enabled
   495  	if !b.enabled {
   496  		return
   497  	}
   498  
   499  	quotaLimit := 0
   500  	unblocked := make(map[*structs.Evaluation]string, 4)
   501  	for id, wrapped := range b.captured {
   502  		if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans {
   503  			unblocked[wrapped.eval] = wrapped.token
   504  			delete(b.captured, id)
   505  			delete(b.jobs, wrapped.eval.JobID)
   506  			if wrapped.eval.QuotaLimitReached != "" {
   507  				quotaLimit++
   508  			}
   509  		}
   510  	}
   511  
   512  	for id, wrapped := range b.escaped {
   513  		if wrapped.eval.TriggeredBy == structs.EvalTriggerMaxPlans {
   514  			unblocked[wrapped.eval] = wrapped.token
   515  			delete(b.escaped, id)
   516  			delete(b.jobs, wrapped.eval.JobID)
   517  			b.stats.TotalEscaped -= 1
   518  			if wrapped.eval.QuotaLimitReached != "" {
   519  				quotaLimit++
   520  			}
   521  		}
   522  	}
   523  
   524  	if l := len(unblocked); l > 0 {
   525  		b.stats.TotalBlocked -= l
   526  		b.stats.TotalQuotaLimit -= quotaLimit
   527  		b.evalBroker.EnqueueAll(unblocked)
   528  	}
   529  }
   530  
   531  // GetDuplicates returns all the duplicate evaluations and blocks until the
   532  // passed timeout.
   533  func (b *BlockedEvals) GetDuplicates(timeout time.Duration) []*structs.Evaluation {
   534  	var timeoutTimer *time.Timer
   535  	var timeoutCh <-chan time.Time
   536  SCAN:
   537  	b.l.Lock()
   538  	if len(b.duplicates) != 0 {
   539  		dups := b.duplicates
   540  		b.duplicates = nil
   541  		b.l.Unlock()
   542  		return dups
   543  	}
   544  	b.l.Unlock()
   545  
   546  	// Create the timer
   547  	if timeoutTimer == nil && timeout != 0 {
   548  		timeoutTimer = time.NewTimer(timeout)
   549  		timeoutCh = timeoutTimer.C
   550  		defer timeoutTimer.Stop()
   551  	}
   552  
   553  	select {
   554  	case <-b.stopCh:
   555  		return nil
   556  	case <-timeoutCh:
   557  		return nil
   558  	case <-b.duplicateCh:
   559  		goto SCAN
   560  	}
   561  }
   562  
   563  // Flush is used to clear the state of blocked evaluations.
   564  func (b *BlockedEvals) Flush() {
   565  	b.l.Lock()
   566  	defer b.l.Unlock()
   567  
   568  	// Reset the blocked eval tracker.
   569  	b.stats.TotalEscaped = 0
   570  	b.stats.TotalBlocked = 0
   571  	b.stats.TotalQuotaLimit = 0
   572  	b.captured = make(map[string]wrappedEval)
   573  	b.escaped = make(map[string]wrappedEval)
   574  	b.jobs = make(map[string]string)
   575  	b.unblockIndexes = make(map[string]uint64)
   576  	b.timetable = nil
   577  	b.duplicates = nil
   578  	b.capacityChangeCh = make(chan *capacityUpdate, unblockBuffer)
   579  	b.stopCh = make(chan struct{})
   580  	b.duplicateCh = make(chan struct{}, 1)
   581  }
   582  
   583  // Stats is used to query the state of the blocked eval tracker.
   584  func (b *BlockedEvals) Stats() *BlockedStats {
   585  	// Allocate a new stats struct
   586  	stats := new(BlockedStats)
   587  
   588  	b.l.RLock()
   589  	defer b.l.RUnlock()
   590  
   591  	// Copy all the stats
   592  	stats.TotalEscaped = b.stats.TotalEscaped
   593  	stats.TotalBlocked = b.stats.TotalBlocked
   594  	stats.TotalQuotaLimit = b.stats.TotalQuotaLimit
   595  	return stats
   596  }
   597  
   598  // EmitStats is used to export metrics about the blocked eval tracker while enabled
   599  func (b *BlockedEvals) EmitStats(period time.Duration, stopCh chan struct{}) {
   600  	for {
   601  		select {
   602  		case <-time.After(period):
   603  			stats := b.Stats()
   604  			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_quota_limit"}, float32(stats.TotalQuotaLimit))
   605  			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_blocked"}, float32(stats.TotalBlocked))
   606  			metrics.SetGauge([]string{"nomad", "blocked_evals", "total_escaped"}, float32(stats.TotalEscaped))
   607  		case <-stopCh:
   608  			return
   609  		}
   610  	}
   611  }
   612  
   613  // prune is a long lived function that prunes unnecessary objects on a timer.
   614  func (b *BlockedEvals) prune() {
   615  	ticker := time.NewTicker(pruneInterval)
   616  	defer ticker.Stop()
   617  
   618  	for {
   619  		select {
   620  		case <-b.stopCh:
   621  			return
   622  		case <-ticker.C:
   623  			b.pruneUnblockIndexes()
   624  		}
   625  	}
   626  }
   627  
   628  // pruneUnblockIndexes is used to prune any tracked entry that is excessively
   629  // old. This protects againsts unbounded growth of the map.
   630  func (b *BlockedEvals) pruneUnblockIndexes() {
   631  	b.l.Lock()
   632  	defer b.l.Unlock()
   633  
   634  	if b.timetable == nil {
   635  		return
   636  	}
   637  
   638  	cutoff := time.Now().UTC().Add(-1 * pruneThreshold)
   639  	oldThreshold := b.timetable.NearestIndex(cutoff)
   640  
   641  	for key, index := range b.unblockIndexes {
   642  		if index < oldThreshold {
   643  			delete(b.unblockIndexes, key)
   644  		}
   645  	}
   646  }