github.com/hhrutter/nomad@v0.6.0-rc2.0.20170723054333-80c4b03f0705/nomad/deploymentwatcher/deployments_watcher.go (about)

     1  package deploymentwatcher
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"time"
     9  
    10  	"golang.org/x/time/rate"
    11  
    12  	"github.com/hashicorp/nomad/nomad/structs"
    13  )
    14  
const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	// NOTE(review): not referenced in this file; presumably passed by the
	// caller as stateQueriesPerSecond to NewDeploymentsWatcher - confirm.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentEvalBatchDuration is the duration in which evaluations are
	// batched across all deployment watchers before committing to Raft.
	// NOTE(review): not referenced in this file; presumably passed by the
	// caller as evalBatchDuration to NewDeploymentsWatcher - confirm.
	CrossDeploymentEvalBatchDuration = 250 * time.Millisecond
)
    24  
var (
	// notEnabled is the error returned when the deployment watcher is not
	// enabled. In this file it is returned by getOrCreateWatcher, and through
	// it by the exported SetAllocHealth, PromoteDeployment, PauseDeployment
	// and FailDeployment methods.
	notEnabled = fmt.Errorf("deployment watcher not enabled")
)
    30  
// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft.
//
// Every method returns a uint64 together with an error.
// NOTE(review): the uint64 is presumably the Raft index at which the write was
// applied - confirm against the implementation.
type DeploymentRaftEndpoints interface {
	// UpsertEvals is used to upsert a set of evaluations
	UpsertEvals([]*structs.Evaluation) (uint64, error)

	// UpsertJob is used to upsert a job
	UpsertJob(job *structs.Job) (uint64, error)

	// UpdateDeploymentStatus is used to make a deployment status update
	// and potentially create an evaluation.
	UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)

	// UpdateDeploymentPromotion is used to promote canaries in a deployment
	UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// UpdateDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}
    51  
// DeploymentStateWatchers are the set of functions required to watch objects on
// behalf of a deployment.
//
// Each method takes a request and a pointer to a reply that the
// implementation fills in. Within this file only List, GetDeployment and
// GetJob are called directly; the Watcher also hands the whole interface to
// each per-deployment watcher it creates.
type DeploymentStateWatchers interface {
	// Evaluations returns the set of evaluations for the given job
	Evaluations(args *structs.JobSpecificRequest, reply *structs.JobEvaluationsResponse) error

	// Allocations returns the set of allocations that are part of the
	// deployment.
	Allocations(args *structs.DeploymentSpecificRequest, reply *structs.AllocListResponse) error

	// List is used to list all the deployments in the system
	List(args *structs.DeploymentListRequest, reply *structs.DeploymentListResponse) error

	// GetDeployment is used to lookup a particular deployment.
	GetDeployment(args *structs.DeploymentSpecificRequest, reply *structs.SingleDeploymentResponse) error

	// GetJobVersions is used to lookup the versions of a job. This is used when
	// rolling back to find the latest stable job
	GetJobVersions(args *structs.JobVersionsRequest, reply *structs.JobVersionsResponse) error

	// GetJob is used to lookup a particular job.
	GetJob(args *structs.JobSpecificRequest, reply *structs.SingleJobResponse) error
}
    75  
// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
type Watcher struct {
	// enabled is toggled by SetEnabled. While it is false, add and remove are
	// no-ops and getOrCreateWatcher returns notEnabled.
	// logger receives the errors reported by the watchDeployments loop and is
	// passed on to every per-deployment watcher.
	enabled bool
	logger  *log.Logger

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// evalBatchDuration is the duration to batch eval creation across all
	// deployment watchers
	evalBatchDuration time.Duration

	// raft contains the set of Raft endpoints that can be used by the
	// deployments watcher
	raft DeploymentRaftEndpoints

	// stateWatchers is the set of functions required to watch a deployment for
	// state changes
	stateWatchers DeploymentStateWatchers

	// watchers is the set of active watchers, one per deployment, keyed by
	// deployment ID. The map is replaced with a fresh one on every flush.
	watchers map[string]*deploymentWatcher

	// evalBatcher is used to batch the creation of evaluations. It is
	// recreated on every flush and bound to the current ctx.
	evalBatcher *EvalBatcher

	// ctx and exitFn are used to cancel the watcher. flush calls exitFn (when
	// set) and then installs a new ctx/exitFn pair.
	ctx    context.Context
	exitFn context.CancelFunc

	// l is held by SetEnabled, add, remove and getOrCreateWatcher while they
	// read or modify the fields above. Within this file only the exclusive
	// lock (Lock/Unlock) is ever taken.
	l sync.RWMutex
}
   110  
   111  // NewDeploymentsWatcher returns a deployments watcher that is used to watch
   112  // deployments and trigger the scheduler as needed.
   113  func NewDeploymentsWatcher(logger *log.Logger, watchers DeploymentStateWatchers,
   114  	raft DeploymentRaftEndpoints, stateQueriesPerSecond float64,
   115  	evalBatchDuration time.Duration) *Watcher {
   116  
   117  	return &Watcher{
   118  		stateWatchers:     watchers,
   119  		raft:              raft,
   120  		queryLimiter:      rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
   121  		evalBatchDuration: evalBatchDuration,
   122  		logger:            logger,
   123  	}
   124  }
   125  
   126  // SetEnabled is used to control if the watcher is enabled. The watcher
   127  // should only be enabled on the active leader.
   128  func (w *Watcher) SetEnabled(enabled bool) error {
   129  	w.l.Lock()
   130  	defer w.l.Unlock()
   131  
   132  	wasEnabled := w.enabled
   133  	w.enabled = enabled
   134  
   135  	// Flush the state to create the necessary objects
   136  	w.flush()
   137  
   138  	// If we are starting now, launch the watch daemon
   139  	if enabled && !wasEnabled {
   140  		go w.watchDeployments(w.ctx)
   141  	}
   142  
   143  	return nil
   144  }
   145  
   146  // flush is used to clear the state of the watcher
   147  func (w *Watcher) flush() {
   148  	// Stop all the watchers and clear it
   149  	for _, watcher := range w.watchers {
   150  		watcher.StopWatch()
   151  	}
   152  
   153  	// Kill everything associated with the watcher
   154  	if w.exitFn != nil {
   155  		w.exitFn()
   156  	}
   157  
   158  	w.watchers = make(map[string]*deploymentWatcher, 32)
   159  	w.ctx, w.exitFn = context.WithCancel(context.Background())
   160  	w.evalBatcher = NewEvalBatcher(w.evalBatchDuration, w.raft, w.ctx)
   161  }
   162  
   163  // watchDeployments is the long lived go-routine that watches for deployments to
   164  // add and remove watchers on.
   165  func (w *Watcher) watchDeployments(ctx context.Context) {
   166  	dindex := uint64(1)
   167  	for {
   168  		// Block getting all deployments using the last deployment index.
   169  		resp, err := w.getDeploys(ctx, dindex)
   170  		if err != nil {
   171  			if err == context.Canceled || ctx.Err() == context.Canceled {
   172  				return
   173  			}
   174  
   175  			w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deploylements: %v", err)
   176  		}
   177  
   178  		// Guard against npe
   179  		if resp == nil {
   180  			continue
   181  		}
   182  
   183  		// Ensure we are tracking the things we should and not tracking what we
   184  		// shouldn't be
   185  		for _, d := range resp.Deployments {
   186  			if d.Active() {
   187  				if err := w.add(d); err != nil {
   188  					w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err)
   189  				}
   190  			} else {
   191  				w.remove(d)
   192  			}
   193  		}
   194  
   195  		// Update the latest index
   196  		dindex = resp.Index
   197  	}
   198  }
   199  
   200  // getDeploys retrieves all deployments blocking at the given index.
   201  func (w *Watcher) getDeploys(ctx context.Context, index uint64) (*structs.DeploymentListResponse, error) {
   202  	// Build the request
   203  	args := &structs.DeploymentListRequest{
   204  		QueryOptions: structs.QueryOptions{
   205  			MinQueryIndex: index,
   206  		},
   207  	}
   208  	var resp structs.DeploymentListResponse
   209  
   210  	for resp.Index <= index {
   211  		if err := w.queryLimiter.Wait(ctx); err != nil {
   212  			return nil, err
   213  		}
   214  
   215  		if err := w.stateWatchers.List(args, &resp); err != nil {
   216  			return nil, err
   217  		}
   218  	}
   219  
   220  	return &resp, nil
   221  }
   222  
   223  // add adds a deployment to the watch list
   224  func (w *Watcher) add(d *structs.Deployment) error {
   225  	w.l.Lock()
   226  	defer w.l.Unlock()
   227  	_, err := w.addLocked(d)
   228  	return err
   229  }
   230  
   231  // addLocked adds a deployment to the watch list and should only be called when
   232  // locked.
   233  func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
   234  	// Not enabled so no-op
   235  	if !w.enabled {
   236  		return nil, nil
   237  	}
   238  
   239  	if !d.Active() {
   240  		return nil, fmt.Errorf("deployment %q is terminal", d.ID)
   241  	}
   242  
   243  	// Already watched so no-op
   244  	if _, ok := w.watchers[d.ID]; ok {
   245  		return nil, nil
   246  	}
   247  
   248  	// Get the job the deployment is referencing
   249  	args := &structs.JobSpecificRequest{
   250  		JobID: d.JobID,
   251  	}
   252  	var resp structs.SingleJobResponse
   253  	if err := w.stateWatchers.GetJob(args, &resp); err != nil {
   254  		return nil, err
   255  	}
   256  	if resp.Job == nil {
   257  		return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
   258  	}
   259  
   260  	watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.stateWatchers, d, resp.Job, w)
   261  	w.watchers[d.ID] = watcher
   262  	return watcher, nil
   263  }
   264  
   265  // remove stops watching a deployment. This can be because the deployment is
   266  // complete or being deleted.
   267  func (w *Watcher) remove(d *structs.Deployment) {
   268  	w.l.Lock()
   269  	defer w.l.Unlock()
   270  
   271  	// Not enabled so no-op
   272  	if !w.enabled {
   273  		return
   274  	}
   275  
   276  	if watcher, ok := w.watchers[d.ID]; ok {
   277  		watcher.StopWatch()
   278  		delete(w.watchers, d.ID)
   279  	}
   280  }
   281  
   282  // forceAdd is used to force a lookup of the given deployment object and create
   283  // a watcher. If the deployment does not exist or is terminal an error is
   284  // returned.
   285  func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
   286  	// Build the request
   287  	args := &structs.DeploymentSpecificRequest{DeploymentID: dID}
   288  	var resp structs.SingleDeploymentResponse
   289  	if err := w.stateWatchers.GetDeployment(args, &resp); err != nil {
   290  		return nil, err
   291  	}
   292  
   293  	if resp.Deployment == nil {
   294  		return nil, fmt.Errorf("unknown deployment %q", dID)
   295  	}
   296  
   297  	return w.addLocked(resp.Deployment)
   298  }
   299  
   300  // getOrCreateWatcher returns the deployment watcher for the given deployment ID.
   301  func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
   302  	w.l.Lock()
   303  	defer w.l.Unlock()
   304  
   305  	// Not enabled so no-op
   306  	if !w.enabled {
   307  		return nil, notEnabled
   308  	}
   309  
   310  	watcher, ok := w.watchers[dID]
   311  	if ok {
   312  		return watcher, nil
   313  	}
   314  
   315  	return w.forceAdd(dID)
   316  }
   317  
   318  // SetAllocHealth is used to set the health of allocations for a deployment. If
   319  // there are any unhealthy allocations, the deployment is updated to be failed.
   320  // Otherwise the allocations are updated and an evaluation is created.
   321  func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
   322  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   323  	if err != nil {
   324  		return err
   325  	}
   326  
   327  	return watcher.SetAllocHealth(req, resp)
   328  }
   329  
   330  // PromoteDeployment is used to promote a deployment. If promote is false,
   331  // deployment is marked as failed. Otherwise the deployment is updated and an
   332  // evaluation is created.
   333  func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
   334  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   335  	if err != nil {
   336  		return err
   337  	}
   338  
   339  	return watcher.PromoteDeployment(req, resp)
   340  }
   341  
   342  // PauseDeployment is used to toggle the pause state on a deployment. If the
   343  // deployment is being unpaused, an evaluation is created.
   344  func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
   345  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   346  	if err != nil {
   347  		return err
   348  	}
   349  
   350  	return watcher.PauseDeployment(req, resp)
   351  }
   352  
   353  // FailDeployment is used to fail the deployment.
   354  func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
   355  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   356  	if err != nil {
   357  		return err
   358  	}
   359  
   360  	return watcher.FailDeployment(req, resp)
   361  }
   362  
   363  // createEvaluation commits the given evaluation to Raft but batches the commit
   364  // with other calls.
   365  func (w *Watcher) createEvaluation(eval *structs.Evaluation) (uint64, error) {
   366  	return w.evalBatcher.CreateEval(eval).Results()
   367  }
   368  
   369  // upsertJob commits the given job to Raft
   370  func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
   371  	return w.raft.UpsertJob(job)
   372  }
   373  
   374  // upsertDeploymentStatusUpdate commits the given deployment update and optional
   375  // evaluation to Raft
   376  func (w *Watcher) upsertDeploymentStatusUpdate(
   377  	u *structs.DeploymentStatusUpdate,
   378  	e *structs.Evaluation,
   379  	j *structs.Job) (uint64, error) {
   380  	return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
   381  		DeploymentUpdate: u,
   382  		Eval:             e,
   383  		Job:              j,
   384  	})
   385  }
   386  
   387  // upsertDeploymentPromotion commits the given deployment promotion to Raft
   388  func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
   389  	return w.raft.UpdateDeploymentPromotion(req)
   390  }
   391  
   392  // upsertDeploymentAllocHealth commits the given allocation health changes to
   393  // Raft
   394  func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
   395  	return w.raft.UpdateDeploymentAllocHealth(req)
   396  }