github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/deploymentwatcher/deployment_watcher.go

package deploymentwatcher

import (
	"context"
	"fmt"
	"log"
	"sync"
	"time"

	"golang.org/x/time/rate"

	memdb "github.com/hashicorp/go-memdb"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/state"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// perJobEvalBatchPeriod is the batching length before creating an evaluation to
	// trigger the scheduler when allocations are marked as healthy.
	perJobEvalBatchPeriod = 1 * time.Second
)

var (
	// allowRescheduleTransition is the transition that allows failed
	// allocations part of a deployment to be rescheduled. We create a one off
	// variable to avoid creating a new object for every request.
	allowRescheduleTransition = &structs.DesiredTransition{
		Reschedule: helper.BoolToPtr(true),
	}
)

// deploymentTriggers are the set of functions required to trigger changes on
// behalf of a deployment
type deploymentTriggers interface {
	// createUpdate is used to create allocation desired transition updates and
	// an evaluation.
	createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error)

	// upsertJob is used to roll back a job when autoreverting for a deployment
	upsertJob(job *structs.Job) (uint64, error)

	// upsertDeploymentStatusUpdate is used to upsert a deployment status update
	// and an optional evaluation and job to upsert
	upsertDeploymentStatusUpdate(u *structs.DeploymentStatusUpdate, eval *structs.Evaluation, job *structs.Job) (uint64, error)

	// upsertDeploymentPromotion is used to promote canaries in a deployment
	upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// upsertDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}

// deploymentWatcher is used to watch a single deployment and trigger the
// scheduler when allocation health transitions.
type deploymentWatcher struct {
	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// deploymentTriggers holds the methods required to trigger changes on behalf of the
	// deployment
	deploymentTriggers

	// state is the state that is watched for state changes.
	state *state.StateStore

	// deploymentID is the deployment's ID being watched
	deploymentID string

	// deploymentUpdateCh is triggered when there is an updated deployment
	deploymentUpdateCh chan struct{}

	// d is the deployment being watched
	d *structs.Deployment

	// j is the job the deployment is for
	j *structs.Job

	// outstandingBatch marks whether an outstanding function exists to create
	// the evaluation. Access should be done through the lock.
	outstandingBatch bool

	// outstandingAllowReplacements is the map of allocations that will be
	// marked as allowing a replacement. Access should be done through the lock.
	outstandingAllowReplacements map[string]*structs.DesiredTransition

	// latestEval is the latest eval for the job. It is updated by the watch
	// loop and any time an evaluation is created. The field should be accessed
	// by holding the lock or using the setter and getter methods.
	latestEval uint64

	logger *log.Logger
	ctx    context.Context
	exitFn context.CancelFunc
	l      sync.RWMutex
}

// newDeploymentWatcher returns a deployment watcher that is used to watch
// deployments and trigger the scheduler as needed.
func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter,
	logger *log.Logger, state *state.StateStore, d *structs.Deployment,
	j *structs.Job, triggers deploymentTriggers) *deploymentWatcher {

	ctx, exitFn := context.WithCancel(parent)
	w := &deploymentWatcher{
		queryLimiter:       queryLimiter,
		deploymentID:       d.ID,
		deploymentUpdateCh: make(chan struct{}, 1),
		d:                  d,
		j:                  j,
		state:              state,
		deploymentTriggers: triggers,
		logger:             logger,
		ctx:                ctx,
		exitFn:             exitFn,
	}

	// Start the long lived watcher that scans for allocation updates
	go w.watch()

	return w
}
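
// Illustrative sketch (not part of the original file): how a caller might wire
// up a watcher. The limiter value, ctx, logger, stateStore, deployment, job,
// and triggers implementation are placeholders assumed to exist in the
// caller's scope; only newDeploymentWatcher and StopWatch come from this file.
//
//	limiter := rate.NewLimiter(rate.Limit(10), 10) // hypothetical rate
//	w := newDeploymentWatcher(ctx, limiter, logger, stateStore, deployment, job, triggers)
//	defer w.StopWatch() // stop the watch loop once the deployment is no longer tracked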

// updateDeployment is used to update the tracked deployment.
func (w *deploymentWatcher) updateDeployment(d *structs.Deployment) {
	w.l.Lock()
	defer w.l.Unlock()

	// Update and trigger
	w.d = d
	select {
	case w.deploymentUpdateCh <- struct{}{}:
	default:
	}
}

// getDeployment returns the tracked deployment.
func (w *deploymentWatcher) getDeployment() *structs.Deployment {
	w.l.RLock()
	defer w.l.RUnlock()
	return w.d
}

func (w *deploymentWatcher) SetAllocHealth(
	req *structs.DeploymentAllocHealthRequest,
	resp *structs.DeploymentUpdateResponse) error {

	// If we are failing the deployment, update the status and potentially
	// roll back
	var j *structs.Job
	var u *structs.DeploymentStatusUpdate

	// If there are unhealthy allocations we need to mark the deployment as
	// failed and check if we should roll back to a stable job.
	if l := len(req.UnhealthyAllocationIDs); l != 0 {
		unhealthy := make(map[string]struct{}, l)
		for _, alloc := range req.UnhealthyAllocationIDs {
			unhealthy[alloc] = struct{}{}
		}

		// Get the allocations for the deployment
		snap, err := w.state.Snapshot()
		if err != nil {
			return err
		}

		allocs, err := snap.AllocsByDeployment(nil, req.DeploymentID)
		if err != nil {
			return err
		}

		// Determine if we should autorevert to an older job
		desc := structs.DeploymentStatusDescriptionFailedAllocations
		for _, alloc := range allocs {
			// Check that the alloc has been marked unhealthy
			if _, ok := unhealthy[alloc.ID]; !ok {
				continue
			}

			// Check if the group has autorevert set
			group, ok := w.getDeployment().TaskGroups[alloc.TaskGroup]
			if !ok || !group.AutoRevert {
				continue
			}

			var err error
			j, err = w.latestStableJob()
			if err != nil {
				return err
			}

			if j != nil {
				j, desc = w.handleRollbackValidity(j, desc)
			}
			break
		}

		u = w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
	}

	// Canonicalize the job in case it doesn't have namespace set
	j.Canonicalize()

	// Create the request
	areq := &structs.ApplyDeploymentAllocHealthRequest{
		DeploymentAllocHealthRequest: *req,
		Timestamp:                    time.Now(),
		Eval:                         w.getEval(),
		DeploymentUpdate:             u,
		Job:                          j,
	}

	index, err := w.upsertDeploymentAllocHealth(areq)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = areq.Eval.ID
	resp.EvalCreateIndex = index
	resp.DeploymentModifyIndex = index
	resp.Index = index
	if j != nil {
		resp.RevertedJobVersion = helper.Uint64ToPtr(j.Version)
	}
	w.setLatestEval(index)
	return nil
}
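
// Illustrative sketch (not part of the original file): marking an allocation
// unhealthy through SetAllocHealth. The allocation ID is a placeholder; only
// the request/response types and the method itself come from the real code.
//
//	var resp structs.DeploymentUpdateResponse
//	err := w.SetAllocHealth(&structs.DeploymentAllocHealthRequest{
//		DeploymentID:           w.deploymentID,
//		UnhealthyAllocationIDs: []string{allocID}, // hypothetical alloc ID
//	}, &resp)
//	// With unhealthy allocs the deployment is marked failed and, if the task
//	// group has auto_revert set, the job may be rolled back to the latest
//	// stable version (see resp.RevertedJobVersion).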

// handleRollbackValidity checks if the job being rolled back to has the same spec as the existing job.
// It returns a modified description and job accordingly.
func (w *deploymentWatcher) handleRollbackValidity(rollbackJob *structs.Job, desc string) (*structs.Job, string) {
	// Only roll back if the job being changed has a different spec.
	// This prevents an infinite revert cycle when a previously stable version of the job fails to start up during a rollback.
	// If the job we are trying to roll back to is identical to the current job, we stop because the rollback will not succeed.
	if w.j.SpecChanged(rollbackJob) {
		desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
	} else {
		desc = structs.DeploymentStatusDescriptionRollbackNoop(desc, rollbackJob.Version)
		rollbackJob = nil
	}
	return rollbackJob, desc
}

func (w *deploymentWatcher) PromoteDeployment(
	req *structs.DeploymentPromoteRequest,
	resp *structs.DeploymentUpdateResponse) error {

	// Create the request
	areq := &structs.ApplyDeploymentPromoteRequest{
		DeploymentPromoteRequest: *req,
		Eval:                     w.getEval(),
	}

	index, err := w.upsertDeploymentPromotion(areq)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = areq.Eval.ID
	resp.EvalCreateIndex = index
	resp.DeploymentModifyIndex = index
	resp.Index = index
	w.setLatestEval(index)
	return nil
}

func (w *deploymentWatcher) PauseDeployment(
	req *structs.DeploymentPauseRequest,
	resp *structs.DeploymentUpdateResponse) error {
	// Determine the status we should transition to and if we need to create an
	// evaluation
	status, desc := structs.DeploymentStatusPaused, structs.DeploymentStatusDescriptionPaused
	var eval *structs.Evaluation
	evalID := ""
	if !req.Pause {
		status, desc = structs.DeploymentStatusRunning, structs.DeploymentStatusDescriptionRunning
		eval = w.getEval()
		evalID = eval.ID
	}
	update := w.getDeploymentStatusUpdate(status, desc)

	// Commit the change
	i, err := w.upsertDeploymentStatusUpdate(update, eval, nil)
	if err != nil {
		return err
	}

	// Build the response
	if evalID != "" {
		resp.EvalID = evalID
		resp.EvalCreateIndex = i
	}
	resp.DeploymentModifyIndex = i
	resp.Index = i
	w.setLatestEval(i)
	return nil
}

func (w *deploymentWatcher) FailDeployment(
	req *structs.DeploymentFailRequest,
	resp *structs.DeploymentUpdateResponse) error {

	status, desc := structs.DeploymentStatusFailed, structs.DeploymentStatusDescriptionFailedByUser

	// Determine if we should roll back
	rollback := false
	for _, state := range w.getDeployment().TaskGroups {
		if state.AutoRevert {
			rollback = true
			break
		}
	}

	var rollbackJob *structs.Job
	if rollback {
		var err error
		rollbackJob, err = w.latestStableJob()
		if err != nil {
			return err
		}

		if rollbackJob != nil {
			rollbackJob, desc = w.handleRollbackValidity(rollbackJob, desc)
		} else {
			desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
		}
	}

	// Commit the change
	update := w.getDeploymentStatusUpdate(status, desc)
	eval := w.getEval()
	i, err := w.upsertDeploymentStatusUpdate(update, eval, rollbackJob)
	if err != nil {
		return err
	}

	// Build the response
	resp.EvalID = eval.ID
	resp.EvalCreateIndex = i
	resp.DeploymentModifyIndex = i
	resp.Index = i
	if rollbackJob != nil {
		resp.RevertedJobVersion = helper.Uint64ToPtr(rollbackJob.Version)
	}
	w.setLatestEval(i)
	return nil
}

// StopWatch stops watching the deployment. This should be called whenever a
// deployment is completed or the watcher is no longer needed.
func (w *deploymentWatcher) StopWatch() {
	w.exitFn()
}

// watch is the long running watcher that watches for both allocation and
// deployment changes. Its function is to create evaluations to trigger the
// scheduler when more progress can be made, to fail the deployment if it has
// failed, and to potentially roll back the job. Progress can be made when an
// allocation transitions to healthy, so we create an eval.
func (w *deploymentWatcher) watch() {
	// Get the deadline. This is likely a zero time to begin with but we need to
	// handle the case that the deployment has already progressed and we are now
	// just starting to watch it. This most likely would occur if there was a
	// leader transition and we are now starting our watcher.
	currentDeadline := getDeploymentProgressCutoff(w.getDeployment())
	var deadlineTimer *time.Timer
	if currentDeadline.IsZero() {
		deadlineTimer = time.NewTimer(0)
		if !deadlineTimer.Stop() {
			<-deadlineTimer.C
		}
	} else {
		deadlineTimer = time.NewTimer(currentDeadline.Sub(time.Now()))
	}

	allocIndex := uint64(1)
	var updates *allocUpdates

	rollback, deadlineHit := false, false

FAIL:
	for {
		select {
		case <-w.ctx.Done():
			return
		case <-deadlineTimer.C:
			// We have hit the progress deadline so fail the deployment. We need
			// to determine whether we should roll back the job by inspecting
			// which allocs that are part of the deployment are healthy and
			// which aren't.
			deadlineHit = true
			fail, rback, err := w.shouldFail()
			if err != nil {
				w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine whether to rollback job for deployment %q: %v", w.deploymentID, err)
			}
			if !fail {
				w.logger.Printf("[DEBUG] nomad.deployment_watcher: skipping deadline for deployment %q", w.deploymentID)
				continue
			}

			w.logger.Printf("[DEBUG] nomad.deployment_watcher: deadline for deployment %q hit and rollback is %v", w.deploymentID, rback)
			rollback = rback
			break FAIL
		case <-w.deploymentUpdateCh:
			// Get the updated deployment and check if we should change the
			// deadline timer
			next := getDeploymentProgressCutoff(w.getDeployment())
			if !next.Equal(currentDeadline) {
				prevDeadlineZero := currentDeadline.IsZero()
				currentDeadline = next
				// The most recent deadline can be zero if no allocs were created for this deployment.
				// The deadline timer would have already been stopped once in that case. To prevent
				// deadlocking on the already stopped deadline timer, we only drain the channel if
				// the previous deadline was not zero.
				if !prevDeadlineZero && !deadlineTimer.Stop() {
					select {
					case <-deadlineTimer.C:
					default:
					}
				}
				deadlineTimer.Reset(next.Sub(time.Now()))
			}

		case updates = <-w.getAllocsCh(allocIndex):
			if err := updates.err; err != nil {
				if err == context.Canceled || w.ctx.Err() == context.Canceled {
					return
				}

				w.logger.Printf("[ERR] nomad.deployment_watcher: failed to retrieve allocations for deployment %q: %v", w.deploymentID, err)
				return
			}
			allocIndex = updates.index

			// We have allocation changes for this deployment so determine the
			// steps to take.
			res, err := w.handleAllocUpdate(updates.allocs)
			if err != nil {
				if err == context.Canceled || w.ctx.Err() == context.Canceled {
					return
				}

				w.logger.Printf("[ERR] nomad.deployment_watcher: failed handling allocation updates: %v", err)
				return
			}

			// The deployment has failed, so break out of the watch loop and
			// handle the failure
			if res.failDeployment {
				rollback = res.rollback
				break FAIL
			}

			// Create an eval to push the deployment along
			if res.createEval || len(res.allowReplacements) != 0 {
				w.createBatchedUpdate(res.allowReplacements, allocIndex)
			}
		}
	}

	// Change the deployment's status to failed
	desc := structs.DeploymentStatusDescriptionFailedAllocations
	if deadlineHit {
		desc = structs.DeploymentStatusDescriptionProgressDeadline
	}

	// Roll back to the old job if necessary
	var j *structs.Job
	if rollback {
		var err error
		j, err = w.latestStableJob()
		if err != nil {
			w.logger.Printf("[ERR] nomad.deployment_watcher: failed to lookup latest stable job for %q: %v", w.j.ID, err)
		}

		// Description should include that the job is being rolled back to
		// version N
		if j != nil {
			j, desc = w.handleRollbackValidity(j, desc)
		} else {
			desc = structs.DeploymentStatusDescriptionNoRollbackTarget(desc)
		}
	}

	// Update the status of the deployment to failed and create an evaluation.
	e := w.getEval()
	u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
	if index, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil {
		w.logger.Printf("[ERR] nomad.deployment_watcher: failed to update deployment %q status: %v", w.deploymentID, err)
	} else {
		w.setLatestEval(index)
	}
}
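
// Illustrative sketch (not part of the original file): the stop/drain/reset
// dance used on deadlineTimer above, shown in isolation. This is the standard
// time.Timer pattern: Stop reports false if the timer already fired (or was
// already stopped), in which case the channel is drained non-blockingly, since
// the fired value may have been consumed elsewhere, before Reset is safe.
//
//	if !timer.Stop() {
//		select {
//		case <-timer.C: // drain a fired-but-unread value
//		default: // value already consumed; nothing to drain
//		}
//	}
//	timer.Reset(next.Sub(time.Now()))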

// allocUpdateResult is used to return the desired actions given the newest set
// of allocations for the deployment.
type allocUpdateResult struct {
	createEval        bool
	failDeployment    bool
	rollback          bool
	allowReplacements []string
}

// handleAllocUpdate is used to compute the set of actions to take based on the
// updated allocations for the deployment.
func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (allocUpdateResult, error) {
	var res allocUpdateResult

	// Get the latest evaluation index
	latestEval, err := w.latestEvalIndex()
	if err != nil {
		if err == context.Canceled || w.ctx.Err() == context.Canceled {
			return res, err
		}

		return res, fmt.Errorf("failed to determine last evaluation index for job %q: %v", w.j.ID, err)
	}

	deployment := w.getDeployment()
	for _, alloc := range allocs {
		dstate, ok := deployment.TaskGroups[alloc.TaskGroup]
		if !ok {
			continue
		}

		// Nothing to do for this allocation
		if alloc.DeploymentStatus == nil || alloc.DeploymentStatus.ModifyIndex <= latestEval {
			continue
		}

		// Determine if the update stanza for this group is progress based
		progressBased := dstate.ProgressDeadline != 0

		// We need to create an eval so the job can progress.
		if alloc.DeploymentStatus.IsHealthy() {
			res.createEval = true
		} else if progressBased && alloc.DeploymentStatus.IsUnhealthy() && deployment.Active() && !alloc.DesiredTransition.ShouldReschedule() {
			res.allowReplacements = append(res.allowReplacements, alloc.ID)
		}

		// If the group is using a progress deadline, we don't have to do anything.
		if progressBased {
			continue
		}

		// Fail on the first bad allocation
		if alloc.DeploymentStatus.IsUnhealthy() {
			// Check if the group has autorevert set
			if dstate.AutoRevert {
				res.rollback = true
			}

			// Since we have an unhealthy allocation, fail the deployment
			res.failDeployment = true
		}

		// All conditions have been hit so we can break
		if res.createEval && res.failDeployment && res.rollback {
			break
		}
	}

	return res, nil
}

// shouldFail returns whether the job should be failed and whether it should be
// rolled back to an earlier stable version by examining the allocations in the
// deployment.
func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
	snap, err := w.state.Snapshot()
	if err != nil {
		return false, false, err
	}

	d, err := snap.DeploymentByID(nil, w.deploymentID)
	if err != nil {
		return false, false, err
	}
	if d == nil {
		// The deployment wasn't in the state store, possibly due to a system gc
		return false, false, fmt.Errorf("deployment id not found: %q", w.deploymentID)
	}

	fail = false
	for tg, state := range d.TaskGroups {
		// If we are in a canary state we fail if there aren't enough healthy
		// allocs to satisfy DesiredCanaries
		if state.DesiredCanaries > 0 && !state.Promoted {
			if state.HealthyAllocs >= state.DesiredCanaries {
				continue
			}
		} else if state.HealthyAllocs >= state.DesiredTotal {
			continue
		}

		// We have failed this TG
		fail = true

		// We don't need to autorevert this group
		upd := w.j.LookupTaskGroup(tg).Update
		if upd == nil || !upd.AutoRevert {
			continue
		}

		// Unhealthy allocs and we need to autorevert
		return true, true, nil
	}

	return fail, false, nil
}

// getDeploymentProgressCutoff returns the progress cutoff for the given
// deployment
func getDeploymentProgressCutoff(d *structs.Deployment) time.Time {
	var next time.Time
	for _, state := range d.TaskGroups {
		if next.IsZero() || state.RequireProgressBy.Before(next) {
			next = state.RequireProgressBy
		}
	}
	return next
}
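
// Illustrative sketch (not part of the original file): the cutoff is the
// earliest RequireProgressBy across all task groups, so with two groups the
// sooner deadline wins. The group names and times are placeholders.
//
//	// assuming d.TaskGroups["web"].RequireProgressBy   = t0.Add(5 * time.Minute)
//	//      and d.TaskGroups["cache"].RequireProgressBy = t0.Add(2 * time.Minute)
//	_ = getDeploymentProgressCutoff(d) // t0.Add(2 * time.Minute), the earliest deadline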

// latestStableJob returns the latest stable job. It may be nil if none exists
func (w *deploymentWatcher) latestStableJob() (*structs.Job, error) {
	snap, err := w.state.Snapshot()
	if err != nil {
		return nil, err
	}

	versions, err := snap.JobVersionsByID(nil, w.j.Namespace, w.j.ID)
	if err != nil {
		return nil, err
	}

	var stable *structs.Job
	for _, job := range versions {
		if job.Stable {
			stable = job
			break
		}
	}

	return stable, nil
}

// createBatchedUpdate creates an eval for the given index as well as updating
// the given allocations to allow them to reschedule.
func (w *deploymentWatcher) createBatchedUpdate(allowReplacements []string, forIndex uint64) {
	w.l.Lock()
	defer w.l.Unlock()

	// Store the allocations that can be replaced
	for _, allocID := range allowReplacements {
		if w.outstandingAllowReplacements == nil {
			w.outstandingAllowReplacements = make(map[string]*structs.DesiredTransition, len(allowReplacements))
		}
		w.outstandingAllowReplacements[allocID] = allowRescheduleTransition
	}

	if w.outstandingBatch || (forIndex < w.latestEval && len(allowReplacements) == 0) {
		return
	}

	w.outstandingBatch = true

	time.AfterFunc(perJobEvalBatchPeriod, func() {
		// If the timer has been created and then we shut down, we need to no-op
		// the evaluation creation.
		select {
		case <-w.ctx.Done():
			return
		default:
		}

		w.l.Lock()
		replacements := w.outstandingAllowReplacements
		w.outstandingAllowReplacements = nil
		w.outstandingBatch = false
		w.l.Unlock()

		// Create the eval
		if index, err := w.createUpdate(replacements, w.getEval()); err != nil {
			w.logger.Printf("[ERR] nomad.deployment_watcher: failed to create evaluation for deployment %q: %v", w.deploymentID, err)
		} else {
			w.setLatestEval(index)
		}
	})
}

// getEval returns an evaluation suitable for the deployment
func (w *deploymentWatcher) getEval() *structs.Evaluation {
	return &structs.Evaluation{
		ID:           uuid.Generate(),
		Namespace:    w.j.Namespace,
		Priority:     w.j.Priority,
		Type:         w.j.Type,
		TriggeredBy:  structs.EvalTriggerDeploymentWatcher,
		JobID:        w.j.ID,
		DeploymentID: w.deploymentID,
		Status:       structs.EvalStatusPending,
	}
}

// getDeploymentStatusUpdate returns a deployment status update
func (w *deploymentWatcher) getDeploymentStatusUpdate(status, desc string) *structs.DeploymentStatusUpdate {
	return &structs.DeploymentStatusUpdate{
		DeploymentID:      w.deploymentID,
		Status:            status,
		StatusDescription: desc,
	}
}

type allocUpdates struct {
	allocs []*structs.AllocListStub
	index  uint64
	err    error
}

// getAllocsCh retrieves the allocations that are part of the deployment,
// blocking at the given index.
func (w *deploymentWatcher) getAllocsCh(index uint64) <-chan *allocUpdates {
	out := make(chan *allocUpdates, 1)
	go func() {
		allocs, index, err := w.getAllocs(index)
		out <- &allocUpdates{
			allocs: allocs,
			index:  index,
			err:    err,
		}
	}()

	return out
}

// getAllocs retrieves the allocations that are part of the deployment,
// blocking at the given index.
func (w *deploymentWatcher) getAllocs(index uint64) ([]*structs.AllocListStub, uint64, error) {
	resp, index, err := w.state.BlockingQuery(w.getAllocsImpl, index, w.ctx)
	if err != nil {
		return nil, 0, err
	}
	if err := w.ctx.Err(); err != nil {
		return nil, 0, err
	}

	return resp.([]*structs.AllocListStub), index, nil
}

// getAllocsImpl retrieves the allocations for the watched deployment from the
// passed state store.
func (w *deploymentWatcher) getAllocsImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
	if err := w.queryLimiter.Wait(w.ctx); err != nil {
		return nil, 0, err
	}

	// Capture all the allocations
	allocs, err := state.AllocsByDeployment(ws, w.deploymentID)
	if err != nil {
		return nil, 0, err
	}

	stubs := make([]*structs.AllocListStub, 0, len(allocs))
	for _, alloc := range allocs {
		stubs = append(stubs, alloc.Stub())
	}

	// Use the last index that affected the allocs table
	index, err := state.Index("allocs")
	if err != nil {
		return nil, index, err
	}

	return stubs, index, nil
}

// latestEvalIndex returns the index of the last evaluation created for
// the job. The index is used to determine if an allocation update requires an
// evaluation to be triggered.
func (w *deploymentWatcher) latestEvalIndex() (uint64, error) {
	if err := w.queryLimiter.Wait(w.ctx); err != nil {
		return 0, err
	}

	snap, err := w.state.Snapshot()
	if err != nil {
		return 0, err
	}

	evals, err := snap.EvalsByJob(nil, w.j.Namespace, w.j.ID)
	if err != nil {
		return 0, err
	}

	if len(evals) == 0 {
		// No evaluations exist yet, so fall back to the index of the evals
		// table itself.
		idx, err := snap.Index("evals")
		if err == nil {
			w.setLatestEval(idx)
		}

		return idx, err
	}

	// Prefer using the snapshot index. Otherwise use the create index
	e := evals[0]
	if e.SnapshotIndex != 0 {
		w.setLatestEval(e.SnapshotIndex)
		return e.SnapshotIndex, nil
	}

	w.setLatestEval(e.CreateIndex)
	return e.CreateIndex, nil
}

// setLatestEval sets the given index as the latest eval unless the currently
// stored index is higher.
func (w *deploymentWatcher) setLatestEval(index uint64) {
	w.l.Lock()
	defer w.l.Unlock()
	if index > w.latestEval {
		w.latestEval = index
	}
}

// getLatestEval returns the latest eval index.
func (w *deploymentWatcher) getLatestEval() uint64 {
	w.l.Lock()
	defer w.l.Unlock()
	return w.latestEval
}
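
// Illustrative sketch (not part of the original file): setLatestEval only ever
// moves the stored index forward, so replaying an older index is a no-op. A
// zero-value deploymentWatcher is enough to demonstrate this, since its mutex
// is usable as-is.
//
//	w := &deploymentWatcher{}
//	w.setLatestEval(10)
//	w.setLatestEval(5)    // ignored, 5 < 10
//	_ = w.getLatestEval() // 10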