github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/deploymentwatcher/deployments_watcher.go (about)

     1  package deploymentwatcher
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"time"
     9  
    10  	"golang.org/x/time/rate"
    11  
    12  	memdb "github.com/hashicorp/go-memdb"
    13  	"github.com/hashicorp/nomad/nomad/state"
    14  	"github.com/hashicorp/nomad/nomad/structs"
    15  )
    16  
const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second.
	// NOTE(review): not referenced in this file; presumably callers pass it as
	// the stateQueriesPerSecond argument of NewDeploymentsWatcher -- confirm.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentUpdateBatchDuration is the duration in which allocation
	// desired transition and evaluation creation updates are batched across
	// all deployment watchers before committing to Raft.
	// NOTE(review): not referenced in this file; presumably callers pass it as
	// the updateBatchDuration argument of NewDeploymentsWatcher -- confirm.
	CrossDeploymentUpdateBatchDuration = 250 * time.Millisecond
)
    27  
var (
	// notEnabled is the error returned when the deployment watcher is not
	// enabled. In this file it is returned by getOrCreateWatcher, and so
	// surfaces through SetAllocHealth, PromoteDeployment, PauseDeployment and
	// FailDeployment when they are invoked on a disabled watcher.
	notEnabled = fmt.Errorf("deployment watcher not enabled")
)
    33  
// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft.
//
// Every method returns a uint64 and an error. The Watcher forwards both
// values to its callers unchanged.
// NOTE(review): the uint64 is presumably the Raft index at which the change
// was applied -- confirm against the implementation.
type DeploymentRaftEndpoints interface {
	// UpsertJob is used to upsert a job
	UpsertJob(job *structs.Job) (uint64, error)

	// UpdateDeploymentStatus is used to make a deployment status update
	// and potentially create an evaluation.
	UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)

	// UpdateDeploymentPromotion is used to promote canaries in a deployment
	UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// UpdateDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)

	// UpdateAllocDesiredTransition is used to update the desired transition
	// for allocations.
	UpdateAllocDesiredTransition(req *structs.AllocUpdateDesiredTransitionRequest) (uint64, error)
}
    55  
// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
//
// A Watcher returned by NewDeploymentsWatcher starts out disabled; the
// watchers map, context and update batcher are only created by flush, which
// SetEnabled invokes.
type Watcher struct {
	// enabled records whether the watcher is currently enabled. It is set by
	// SetEnabled and checked by add, remove and getOrCreateWatcher.
	enabled bool

	// logger receives the error messages emitted by the watch daemon.
	logger *log.Logger

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// updateBatchDuration is the duration to batch allocation desired
	// transition and eval creation across all deployment watchers
	updateBatchDuration time.Duration

	// raft contains the set of Raft endpoints that can be used by the
	// deployments watcher
	raft DeploymentRaftEndpoints

	// state is the state that is watched for state changes.
	state *state.StateStore

	// watchers is the set of active watchers, one per deployment
	watchers map[string]*deploymentWatcher

	// allocUpdateBatcher is used to batch the creation of evaluations and
	// allocation desired transition updates
	allocUpdateBatcher *AllocUpdateBatcher

	// ctx and exitFn are used to cancel the watcher. flush replaces both on
	// every call after invoking the previous exitFn.
	ctx    context.Context
	exitFn context.CancelFunc

	// l is held (as a write lock) by SetEnabled, add, remove and
	// getOrCreateWatcher while they read or mutate the fields above.
	// NOTE(review): createUpdate reads allocUpdateBatcher without holding l.
	l sync.RWMutex
}
    90  
    91  // NewDeploymentsWatcher returns a deployments watcher that is used to watch
    92  // deployments and trigger the scheduler as needed.
    93  func NewDeploymentsWatcher(logger *log.Logger,
    94  	raft DeploymentRaftEndpoints, stateQueriesPerSecond float64,
    95  	updateBatchDuration time.Duration) *Watcher {
    96  
    97  	return &Watcher{
    98  		raft:                raft,
    99  		queryLimiter:        rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
   100  		updateBatchDuration: updateBatchDuration,
   101  		logger:              logger,
   102  	}
   103  }
   104  
   105  // SetEnabled is used to control if the watcher is enabled. The watcher
   106  // should only be enabled on the active leader. When being enabled the state is
   107  // passed in as it is no longer valid once a leader election has taken place.
   108  func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) {
   109  	w.l.Lock()
   110  	defer w.l.Unlock()
   111  
   112  	wasEnabled := w.enabled
   113  	w.enabled = enabled
   114  
   115  	if state != nil {
   116  		w.state = state
   117  	}
   118  
   119  	// Flush the state to create the necessary objects
   120  	w.flush()
   121  
   122  	// If we are starting now, launch the watch daemon
   123  	if enabled && !wasEnabled {
   124  		go w.watchDeployments(w.ctx)
   125  	}
   126  }
   127  
   128  // flush is used to clear the state of the watcher
   129  func (w *Watcher) flush() {
   130  	// Stop all the watchers and clear it
   131  	for _, watcher := range w.watchers {
   132  		watcher.StopWatch()
   133  	}
   134  
   135  	// Kill everything associated with the watcher
   136  	if w.exitFn != nil {
   137  		w.exitFn()
   138  	}
   139  
   140  	w.watchers = make(map[string]*deploymentWatcher, 32)
   141  	w.ctx, w.exitFn = context.WithCancel(context.Background())
   142  	w.allocUpdateBatcher = NewAllocUpdateBatcher(w.updateBatchDuration, w.raft, w.ctx)
   143  }
   144  
   145  // watchDeployments is the long lived go-routine that watches for deployments to
   146  // add and remove watchers on.
   147  func (w *Watcher) watchDeployments(ctx context.Context) {
   148  	dindex := uint64(1)
   149  	for {
   150  		// Block getting all deployments using the last deployment index.
   151  		deployments, idx, err := w.getDeploys(ctx, dindex)
   152  		if err != nil {
   153  			if err == context.Canceled {
   154  				return
   155  			}
   156  
   157  			w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deployments: %v", err)
   158  		}
   159  
   160  		// Update the latest index
   161  		dindex = idx
   162  
   163  		// Ensure we are tracking the things we should and not tracking what we
   164  		// shouldn't be
   165  		for _, d := range deployments {
   166  			if d.Active() {
   167  				if err := w.add(d); err != nil {
   168  					w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err)
   169  				}
   170  			} else {
   171  				w.remove(d)
   172  			}
   173  		}
   174  	}
   175  }
   176  
   177  // getDeploys retrieves all deployments blocking at the given index.
   178  func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) {
   179  	resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx)
   180  	if err != nil {
   181  		return nil, 0, err
   182  	}
   183  
   184  	return resp.([]*structs.Deployment), index, nil
   185  }
   186  
   187  // getDeploysImpl retrieves all deployments from the passed state store.
   188  func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
   189  
   190  	iter, err := state.Deployments(ws)
   191  	if err != nil {
   192  		return nil, 0, err
   193  	}
   194  
   195  	var deploys []*structs.Deployment
   196  	for {
   197  		raw := iter.Next()
   198  		if raw == nil {
   199  			break
   200  		}
   201  		deploy := raw.(*structs.Deployment)
   202  		deploys = append(deploys, deploy)
   203  	}
   204  
   205  	// Use the last index that affected the deployment table
   206  	index, err := state.Index("deployment")
   207  	if err != nil {
   208  		return nil, 0, err
   209  	}
   210  
   211  	return deploys, index, nil
   212  }
   213  
   214  // add adds a deployment to the watch list
   215  func (w *Watcher) add(d *structs.Deployment) error {
   216  	w.l.Lock()
   217  	defer w.l.Unlock()
   218  	_, err := w.addLocked(d)
   219  	return err
   220  }
   221  
   222  // addLocked adds a deployment to the watch list and should only be called when
   223  // locked.
   224  func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
   225  	// Not enabled so no-op
   226  	if !w.enabled {
   227  		return nil, nil
   228  	}
   229  
   230  	if !d.Active() {
   231  		return nil, fmt.Errorf("deployment %q is terminal", d.ID)
   232  	}
   233  
   234  	// Already watched so just update the deployment
   235  	if w, ok := w.watchers[d.ID]; ok {
   236  		w.updateDeployment(d)
   237  		return nil, nil
   238  	}
   239  
   240  	// Get the job the deployment is referencing
   241  	snap, err := w.state.Snapshot()
   242  	if err != nil {
   243  		return nil, err
   244  	}
   245  
   246  	job, err := snap.JobByID(nil, d.Namespace, d.JobID)
   247  	if err != nil {
   248  		return nil, err
   249  	}
   250  	if job == nil {
   251  		return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
   252  	}
   253  
   254  	watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, w)
   255  	w.watchers[d.ID] = watcher
   256  	return watcher, nil
   257  }
   258  
   259  // remove stops watching a deployment. This can be because the deployment is
   260  // complete or being deleted.
   261  func (w *Watcher) remove(d *structs.Deployment) {
   262  	w.l.Lock()
   263  	defer w.l.Unlock()
   264  
   265  	// Not enabled so no-op
   266  	if !w.enabled {
   267  		return
   268  	}
   269  
   270  	if watcher, ok := w.watchers[d.ID]; ok {
   271  		watcher.StopWatch()
   272  		delete(w.watchers, d.ID)
   273  	}
   274  }
   275  
   276  // forceAdd is used to force a lookup of the given deployment object and create
   277  // a watcher. If the deployment does not exist or is terminal an error is
   278  // returned.
   279  func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
   280  	snap, err := w.state.Snapshot()
   281  	if err != nil {
   282  		return nil, err
   283  	}
   284  
   285  	deployment, err := snap.DeploymentByID(nil, dID)
   286  	if err != nil {
   287  		return nil, err
   288  	}
   289  
   290  	if deployment == nil {
   291  		return nil, fmt.Errorf("unknown deployment %q", dID)
   292  	}
   293  
   294  	return w.addLocked(deployment)
   295  }
   296  
   297  // getOrCreateWatcher returns the deployment watcher for the given deployment ID.
   298  func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
   299  	w.l.Lock()
   300  	defer w.l.Unlock()
   301  
   302  	// Not enabled so no-op
   303  	if !w.enabled {
   304  		return nil, notEnabled
   305  	}
   306  
   307  	watcher, ok := w.watchers[dID]
   308  	if ok {
   309  		return watcher, nil
   310  	}
   311  
   312  	return w.forceAdd(dID)
   313  }
   314  
   315  // SetAllocHealth is used to set the health of allocations for a deployment. If
   316  // there are any unhealthy allocations, the deployment is updated to be failed.
   317  // Otherwise the allocations are updated and an evaluation is created.
   318  func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
   319  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   320  	if err != nil {
   321  		return err
   322  	}
   323  
   324  	return watcher.SetAllocHealth(req, resp)
   325  }
   326  
   327  // PromoteDeployment is used to promote a deployment. If promote is false,
   328  // deployment is marked as failed. Otherwise the deployment is updated and an
   329  // evaluation is created.
   330  func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
   331  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   332  	if err != nil {
   333  		return err
   334  	}
   335  
   336  	return watcher.PromoteDeployment(req, resp)
   337  }
   338  
   339  // PauseDeployment is used to toggle the pause state on a deployment. If the
   340  // deployment is being unpaused, an evaluation is created.
   341  func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
   342  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   343  	if err != nil {
   344  		return err
   345  	}
   346  
   347  	return watcher.PauseDeployment(req, resp)
   348  }
   349  
   350  // FailDeployment is used to fail the deployment.
   351  func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
   352  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   353  	if err != nil {
   354  		return err
   355  	}
   356  
   357  	return watcher.FailDeployment(req, resp)
   358  }
   359  
   360  // createUpdate commits the given allocation desired transition and evaluation
   361  // to Raft but batches the commit with other calls.
   362  func (w *Watcher) createUpdate(allocs map[string]*structs.DesiredTransition, eval *structs.Evaluation) (uint64, error) {
   363  	return w.allocUpdateBatcher.CreateUpdate(allocs, eval).Results()
   364  }
   365  
   366  // upsertJob commits the given job to Raft
   367  func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
   368  	return w.raft.UpsertJob(job)
   369  }
   370  
   371  // upsertDeploymentStatusUpdate commits the given deployment update and optional
   372  // evaluation to Raft
   373  func (w *Watcher) upsertDeploymentStatusUpdate(
   374  	u *structs.DeploymentStatusUpdate,
   375  	e *structs.Evaluation,
   376  	j *structs.Job) (uint64, error) {
   377  	return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
   378  		DeploymentUpdate: u,
   379  		Eval:             e,
   380  		Job:              j,
   381  	})
   382  }
   383  
   384  // upsertDeploymentPromotion commits the given deployment promotion to Raft
   385  func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
   386  	return w.raft.UpdateDeploymentPromotion(req)
   387  }
   388  
   389  // upsertDeploymentAllocHealth commits the given allocation health changes to
   390  // Raft
   391  func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
   392  	return w.raft.UpdateDeploymentAllocHealth(req)
   393  }