github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/deploymentwatcher/deployment_watcher.go

github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/deploymentwatcher/deployment_watcher.go (about)

     1  package deploymentwatcher
     2  
     3  import (
     4  	"context"
     5  	"log"
     6  	"sync"
     7  	"time"
     8  
     9  	"golang.org/x/time/rate"
    10  
    11  	"github.com/hashicorp/nomad/helper"
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
    15  const (
    16  	// perJobEvalBatchPeriod is the batching length before creating an evaluation to
    17  	// trigger the scheduler when allocations are marked as healthy.
    18  	perJobEvalBatchPeriod = 1 * time.Second
    19  )
    20  
// deploymentTriggers are the set of functions required to trigger changes on
// behalf of a deployment. Every method returns a uint64 and an error; the
// callers in this file treat the uint64 as the index at which the change was
// applied and copy it into their responses.
type deploymentTriggers interface {
	// createEvaluation is used to create an evaluation.
	createEvaluation(eval *structs.Evaluation) (uint64, error)

	// upsertJob is used to roll back a job when autoreverting for a deployment
	upsertJob(job *structs.Job) (uint64, error)

	// upsertDeploymentStatusUpdate is used to upsert a deployment status update
	// and an optional evaluation and job to upsert. Callers in this file pass
	// nil for eval and/or job when there is nothing to upsert.
	upsertDeploymentStatusUpdate(u *structs.DeploymentStatusUpdate, eval *structs.Evaluation, job *structs.Job) (uint64, error)

	// upsertDeploymentPromotion is used to promote canaries in a deployment
	upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// upsertDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}
    41  
// deploymentWatcher is used to watch a single deployment and trigger the
// scheduler when allocation health transitions.
type deploymentWatcher struct {
	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// deploymentTriggers holds the methods required to trigger changes on behalf of the
	// deployment
	deploymentTriggers

	// watchers holds the methods required to watch objects for
	// changes on behalf of the deployment
	watchers DeploymentStateWatchers

	// d is the deployment being watched
	d *structs.Deployment

	// j is the job the deployment is for
	j *structs.Job

	// outstandingBatch marks whether an outstanding function exists to create
	// the evaluation. Access should be done through the lock
	outstandingBatch bool

	// latestEval is the latest eval for the job. It is updated by the watch
	// loop and any time an evaluation is created. The field should be accessed
	// by holding the lock or using the setter and getter methods.
	latestEval uint64

	// logger receives the watcher's error messages. ctx is the watcher's
	// context, derived from the parent context in newDeploymentWatcher, and
	// exitFn is its cancel function (invoked by StopWatch). l is the lock
	// guarding outstandingBatch and latestEval.
	logger *log.Logger
	ctx    context.Context
	exitFn context.CancelFunc
	l      sync.RWMutex
}
    76  
    77  // newDeploymentWatcher returns a deployment watcher that is used to watch
    78  // deployments and trigger the scheduler as needed.
    79  func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter,
    80  	logger *log.Logger, watchers DeploymentStateWatchers, d *structs.Deployment,
    81  	j *structs.Job, triggers deploymentTriggers) *deploymentWatcher {
    82  
    83  	ctx, exitFn := context.WithCancel(parent)
    84  	w := &deploymentWatcher{
    85  		queryLimiter:       queryLimiter,
    86  		d:                  d,
    87  		j:                  j,
    88  		watchers:           watchers,
    89  		deploymentTriggers: triggers,
    90  		logger:             logger,
    91  		ctx:                ctx,
    92  		exitFn:             exitFn,
    93  	}
    94  
    95  	// Start the long lived watcher that scans for allocation updates
    96  	go w.watch()
    97  
    98  	return w
    99  }
   100  
   101  func (w *deploymentWatcher) SetAllocHealth(
   102  	req *structs.DeploymentAllocHealthRequest,
   103  	resp *structs.DeploymentUpdateResponse) error {
   104  
   105  	// If we are failing the deployment, update the status and potentially
   106  	// rollback
   107  	var j *structs.Job
   108  	var u *structs.DeploymentStatusUpdate
   109  
   110  	// If there are unhealthy allocations we need to mark the deployment as
   111  	// failed and check if we should roll back to a stable job.
   112  	if l := len(req.UnhealthyAllocationIDs); l != 0 {
   113  		unhealthy := make(map[string]struct{}, l)
   114  		for _, alloc := range req.UnhealthyAllocationIDs {
   115  			unhealthy[alloc] = struct{}{}
   116  		}
   117  
   118  		// Get the allocations for the deployment
   119  		args := &structs.DeploymentSpecificRequest{DeploymentID: req.DeploymentID}
   120  		var resp structs.AllocListResponse
   121  		if err := w.watchers.Allocations(args, &resp); err != nil {
   122  			return err
   123  		}
   124  
   125  		// Determine if we should autorevert to an older job
   126  		desc := structs.DeploymentStatusDescriptionFailedAllocations
   127  		for _, alloc := range resp.Allocations {
   128  			// Check that the alloc has been marked unhealthy
   129  			if _, ok := unhealthy[alloc.ID]; !ok {
   130  				continue
   131  			}
   132  
   133  			// Check if the group has autorevert set
   134  			group, ok := w.d.TaskGroups[alloc.TaskGroup]
   135  			if !ok || !group.AutoRevert {
   136  				continue
   137  			}
   138  
   139  			var err error
   140  			j, err = w.latestStableJob()
   141  			if err != nil {
   142  				return err
   143  			}
   144  
   145  			if j != nil {
   146  				desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
   147  			}
   148  			break
   149  		}
   150  
   151  		u = w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
   152  	}
   153  
   154  	// Create the request
   155  	areq := &structs.ApplyDeploymentAllocHealthRequest{
   156  		DeploymentAllocHealthRequest: *req,
   157  		Eval:             w.getEval(),
   158  		DeploymentUpdate: u,
   159  		Job:              j,
   160  	}
   161  
   162  	index, err := w.upsertDeploymentAllocHealth(areq)
   163  	if err != nil {
   164  		return err
   165  	}
   166  
   167  	// Build the response
   168  	resp.EvalID = areq.Eval.ID
   169  	resp.EvalCreateIndex = index
   170  	resp.DeploymentModifyIndex = index
   171  	resp.Index = index
   172  	if j != nil {
   173  		resp.RevertedJobVersion = helper.Uint64ToPtr(j.Version)
   174  	}
   175  	w.setLatestEval(index)
   176  	return nil
   177  }
   178  
   179  func (w *deploymentWatcher) PromoteDeployment(
   180  	req *structs.DeploymentPromoteRequest,
   181  	resp *structs.DeploymentUpdateResponse) error {
   182  
   183  	// Create the request
   184  	areq := &structs.ApplyDeploymentPromoteRequest{
   185  		DeploymentPromoteRequest: *req,
   186  		Eval: w.getEval(),
   187  	}
   188  
   189  	index, err := w.upsertDeploymentPromotion(areq)
   190  	if err != nil {
   191  		return err
   192  	}
   193  
   194  	// Build the response
   195  	resp.EvalID = areq.Eval.ID
   196  	resp.EvalCreateIndex = index
   197  	resp.DeploymentModifyIndex = index
   198  	resp.Index = index
   199  	w.setLatestEval(index)
   200  	return nil
   201  }
   202  
   203  func (w *deploymentWatcher) PauseDeployment(
   204  	req *structs.DeploymentPauseRequest,
   205  	resp *structs.DeploymentUpdateResponse) error {
   206  	// Determine the status we should transistion to and if we need to create an
   207  	// evaluation
   208  	status, desc := structs.DeploymentStatusPaused, structs.DeploymentStatusDescriptionPaused
   209  	var eval *structs.Evaluation
   210  	evalID := ""
   211  	if !req.Pause {
   212  		status, desc = structs.DeploymentStatusRunning, structs.DeploymentStatusDescriptionRunning
   213  		eval = w.getEval()
   214  		evalID = eval.ID
   215  	}
   216  	update := w.getDeploymentStatusUpdate(status, desc)
   217  
   218  	// Commit the change
   219  	i, err := w.upsertDeploymentStatusUpdate(update, eval, nil)
   220  	if err != nil {
   221  		return err
   222  	}
   223  
   224  	// Build the response
   225  	if evalID != "" {
   226  		resp.EvalID = evalID
   227  		resp.EvalCreateIndex = i
   228  	}
   229  	resp.DeploymentModifyIndex = i
   230  	resp.Index = i
   231  	w.setLatestEval(i)
   232  	return nil
   233  }
   234  
   235  func (w *deploymentWatcher) FailDeployment(
   236  	req *structs.DeploymentFailRequest,
   237  	resp *structs.DeploymentUpdateResponse) error {
   238  
   239  	status, desc := structs.DeploymentStatusFailed, structs.DeploymentStatusDescriptionFailedByUser
   240  
   241  	// Determine if we should rollback
   242  	rollback := false
   243  	for _, state := range w.d.TaskGroups {
   244  		if state.AutoRevert {
   245  			rollback = true
   246  			break
   247  		}
   248  	}
   249  
   250  	var rollbackJob *structs.Job
   251  	if rollback {
   252  		var err error
   253  		rollbackJob, err = w.latestStableJob()
   254  		if err != nil {
   255  			return err
   256  		}
   257  
   258  		if rollbackJob != nil {
   259  			desc = structs.DeploymentStatusDescriptionRollback(desc, rollbackJob.Version)
   260  		}
   261  	}
   262  
   263  	// Commit the change
   264  	update := w.getDeploymentStatusUpdate(status, desc)
   265  	eval := w.getEval()
   266  	i, err := w.upsertDeploymentStatusUpdate(update, eval, rollbackJob)
   267  	if err != nil {
   268  		return err
   269  	}
   270  
   271  	// Build the response
   272  	resp.EvalID = eval.ID
   273  	resp.EvalCreateIndex = i
   274  	resp.DeploymentModifyIndex = i
   275  	resp.Index = i
   276  	if rollbackJob != nil {
   277  		resp.RevertedJobVersion = helper.Uint64ToPtr(rollbackJob.Version)
   278  	}
   279  	w.setLatestEval(i)
   280  	return nil
   281  }
   282  
// StopWatch stops watching the deployment. This should be called whenever a
// deployment is completed or the watcher is no longer needed. It invokes the
// cancel function of the watcher's context, which newDeploymentWatcher derived
// from the parent context.
func (w *deploymentWatcher) StopWatch() {
	w.exitFn()
}
   288  
   289  // watch is the long running watcher that takes actions upon allocation changes
   290  func (w *deploymentWatcher) watch() {
   291  	allocIndex := uint64(1)
   292  	for {
   293  		// Block getting all allocations that are part of the deployment using
   294  		// the last evaluation index. This will have us block waiting for
   295  		// something to change past what the scheduler has evaluated.
   296  		allocResp, err := w.getAllocs(allocIndex)
   297  		if err != nil {
   298  			if err == context.Canceled || w.ctx.Err() == context.Canceled {
   299  				return
   300  			}
   301  
   302  			w.logger.Printf("[ERR] nomad.deployment_watcher: failed to retrieve allocations for deployment %q: %v", w.d.ID, err)
   303  			return
   304  		}
   305  		allocIndex = allocResp.Index
   306  
   307  		// Get the latest evaluation index
   308  		latestEval, err := w.latestEvalIndex()
   309  		if err != nil {
   310  			if err == context.Canceled || w.ctx.Err() == context.Canceled {
   311  				return
   312  			}
   313  
   314  			w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine last evaluation index for job %q: %v", w.d.JobID, err)
   315  			return
   316  		}
   317  
   318  		// Create an evaluation trigger if there is any allocation whose
   319  		// deployment status has been updated past the latest eval index.
   320  		createEval, failDeployment, rollback := false, false, false
   321  		for _, alloc := range allocResp.Allocations {
   322  			if alloc.DeploymentStatus == nil || alloc.DeploymentStatus.ModifyIndex <= latestEval {
   323  				continue
   324  			}
   325  
   326  			// We need to create an eval
   327  			createEval = true
   328  
   329  			if alloc.DeploymentStatus.IsUnhealthy() {
   330  				// Check if the group has autorevert set
   331  				group, ok := w.d.TaskGroups[alloc.TaskGroup]
   332  				if ok && group.AutoRevert {
   333  					rollback = true
   334  				}
   335  
   336  				// Since we have an unhealthy allocation, fail the deployment
   337  				failDeployment = true
   338  			}
   339  
   340  			// All conditions have been hit so we can break
   341  			if createEval && failDeployment && rollback {
   342  				break
   343  			}
   344  		}
   345  
   346  		// Change the deployments status to failed
   347  		if failDeployment {
   348  			// Default description
   349  			desc := structs.DeploymentStatusDescriptionFailedAllocations
   350  
   351  			// Rollback to the old job if necessary
   352  			var j *structs.Job
   353  			if rollback {
   354  				var err error
   355  				j, err = w.latestStableJob()
   356  				if err != nil {
   357  					w.logger.Printf("[ERR] nomad.deployment_watcher: failed to lookup latest stable job for %q: %v", w.d.JobID, err)
   358  				}
   359  
   360  				// Description should include that the job is being rolled back to
   361  				// version N
   362  				if j != nil {
   363  					desc = structs.DeploymentStatusDescriptionRollback(desc, j.Version)
   364  				}
   365  			}
   366  
   367  			// Update the status of the deployment to failed and create an
   368  			// evaluation.
   369  			e := w.getEval()
   370  			u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc)
   371  			if index, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil {
   372  				w.logger.Printf("[ERR] nomad.deployment_watcher: failed to update deployment %q status: %v", w.d.ID, err)
   373  			} else {
   374  				w.setLatestEval(index)
   375  			}
   376  		} else if createEval {
   377  			// Create an eval to push the deployment along
   378  			w.createEvalBatched(allocResp.Index)
   379  		}
   380  	}
   381  }
   382  
   383  // latestStableJob returns the latest stable job. It may be nil if none exist
   384  func (w *deploymentWatcher) latestStableJob() (*structs.Job, error) {
   385  	args := &structs.JobVersionsRequest{JobID: w.d.JobID}
   386  	var resp structs.JobVersionsResponse
   387  	if err := w.watchers.GetJobVersions(args, &resp); err != nil {
   388  		return nil, err
   389  	}
   390  
   391  	var stable *structs.Job
   392  	for _, job := range resp.Versions {
   393  		if job.Stable {
   394  			stable = job
   395  			break
   396  		}
   397  	}
   398  
   399  	return stable, nil
   400  }
   401  
   402  // createEvalBatched creates an eval but batches calls together
   403  func (w *deploymentWatcher) createEvalBatched(forIndex uint64) {
   404  	w.l.Lock()
   405  	defer w.l.Unlock()
   406  
   407  	if w.outstandingBatch || forIndex < w.latestEval {
   408  		return
   409  	}
   410  
   411  	w.outstandingBatch = true
   412  
   413  	time.AfterFunc(perJobEvalBatchPeriod, func() {
   414  		// Create the eval
   415  		evalCreateIndex, err := w.createEvaluation(w.getEval())
   416  		if err != nil {
   417  			w.logger.Printf("[ERR] nomad.deployment_watcher: failed to create evaluation for deployment %q: %v", w.d.ID, err)
   418  		} else {
   419  			w.setLatestEval(evalCreateIndex)
   420  		}
   421  
   422  		w.l.Lock()
   423  		w.outstandingBatch = false
   424  		w.l.Unlock()
   425  
   426  	})
   427  }
   428  
   429  // getEval returns an evaluation suitable for the deployment
   430  func (w *deploymentWatcher) getEval() *structs.Evaluation {
   431  	return &structs.Evaluation{
   432  		ID:           structs.GenerateUUID(),
   433  		Priority:     w.j.Priority,
   434  		Type:         w.j.Type,
   435  		TriggeredBy:  structs.EvalTriggerDeploymentWatcher,
   436  		JobID:        w.j.ID,
   437  		DeploymentID: w.d.ID,
   438  		Status:       structs.EvalStatusPending,
   439  	}
   440  }
   441  
   442  // getDeploymentStatusUpdate returns a deployment status update
   443  func (w *deploymentWatcher) getDeploymentStatusUpdate(status, desc string) *structs.DeploymentStatusUpdate {
   444  	return &structs.DeploymentStatusUpdate{
   445  		DeploymentID:      w.d.ID,
   446  		Status:            status,
   447  		StatusDescription: desc,
   448  	}
   449  }
   450  
   451  // getAllocs retrieves the allocations that are part of the deployment blocking
   452  // at the given index.
   453  func (w *deploymentWatcher) getAllocs(index uint64) (*structs.AllocListResponse, error) {
   454  	// Build the request
   455  	args := &structs.DeploymentSpecificRequest{
   456  		DeploymentID: w.d.ID,
   457  		QueryOptions: structs.QueryOptions{
   458  			MinQueryIndex: index,
   459  		},
   460  	}
   461  	var resp structs.AllocListResponse
   462  
   463  	for resp.Index <= index {
   464  		if err := w.queryLimiter.Wait(w.ctx); err != nil {
   465  			return nil, err
   466  		}
   467  
   468  		if err := w.watchers.Allocations(args, &resp); err != nil {
   469  			return nil, err
   470  		}
   471  	}
   472  
   473  	return &resp, nil
   474  }
   475  
   476  // latestEvalIndex returns the index of the last evaluation created for
   477  // the job. The index is used to determine if an allocation update requires an
   478  // evaluation to be triggered.
   479  func (w *deploymentWatcher) latestEvalIndex() (uint64, error) {
   480  	if err := w.queryLimiter.Wait(w.ctx); err != nil {
   481  		return 0, err
   482  	}
   483  
   484  	args := &structs.JobSpecificRequest{
   485  		JobID: w.d.JobID,
   486  	}
   487  	var resp structs.JobEvaluationsResponse
   488  	err := w.watchers.Evaluations(args, &resp)
   489  	if err != nil {
   490  		return 0, err
   491  	}
   492  
   493  	if len(resp.Evaluations) == 0 {
   494  		w.setLatestEval(resp.Index)
   495  		return resp.Index, nil
   496  	}
   497  
   498  	// Prefer using the snapshot index. Otherwise use the create index
   499  	e := resp.Evaluations[0]
   500  	if e.SnapshotIndex != 0 {
   501  		w.setLatestEval(e.SnapshotIndex)
   502  		return e.SnapshotIndex, nil
   503  	}
   504  
   505  	w.setLatestEval(e.CreateIndex)
   506  	return e.CreateIndex, nil
   507  }
   508  
   509  // setLatestEval sets the given index as the latest eval unless the currently
   510  // stored index is higher.
   511  func (w *deploymentWatcher) setLatestEval(index uint64) {
   512  	w.l.Lock()
   513  	defer w.l.Unlock()
   514  	if index > w.latestEval {
   515  		w.latestEval = index
   516  	}
   517  }
   518  
   519  // getLatestEval returns the latest eval index.
   520  func (w *deploymentWatcher) getLatestEval() uint64 {
   521  	w.l.Lock()
   522  	defer w.l.Unlock()
   523  	return w.latestEval
   524  }