github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/deploymentwatcher/deployments_watcher.go

github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/deploymentwatcher/deployments_watcher.go (about)

     1  package deploymentwatcher
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"time"
     9  
    10  	"golang.org/x/time/rate"
    11  
    12  	memdb "github.com/hashicorp/go-memdb"
    13  	"github.com/hashicorp/nomad/nomad/state"
    14  	"github.com/hashicorp/nomad/nomad/structs"
    15  )
    16  
// Default tuning values for the deployments watcher.
const (
	// LimitStateQueriesPerSecond is the number of state queries allowed per
	// second. It is an untyped floating-point constant, matching the float64
	// stateQueriesPerSecond parameter accepted by NewDeploymentsWatcher.
	LimitStateQueriesPerSecond = 100.0

	// CrossDeploymentEvalBatchDuration is the duration in which evaluations are
	// batched across all deployment watchers before committing to Raft.
	CrossDeploymentEvalBatchDuration = 250 * time.Millisecond
)
    26  
var (
	// notEnabled is the error returned when the deployment watcher is not
	// enabled. getOrCreateWatcher returns it, so every request entry point
	// that resolves a watcher through it (SetAllocHealth, PromoteDeployment,
	// PauseDeployment, FailDeployment) surfaces it to the caller.
	notEnabled = fmt.Errorf("deployment watcher not enabled")
)
    32  
// DeploymentRaftEndpoints exposes the deployment watcher to a set of functions
// to apply data transforms via Raft. The Watcher stores one implementation in
// its raft field, calls it from its upsert* helpers, and hands it to the eval
// batcher it creates in flush.
//
// Every method returns a uint64 alongside the error; presumably this is the
// Raft index at which the write was applied (TODO confirm against the
// implementation).
type DeploymentRaftEndpoints interface {
	// UpsertEvals is used to upsert a set of evaluations
	UpsertEvals([]*structs.Evaluation) (uint64, error)

	// UpsertJob is used to upsert a job
	UpsertJob(job *structs.Job) (uint64, error)

	// UpdateDeploymentStatus is used to make a deployment status update
	// and potentially create an evaluation.
	UpdateDeploymentStatus(u *structs.DeploymentStatusUpdateRequest) (uint64, error)

	// UpdateDeploymentPromotion is used to promote canaries in a deployment
	UpdateDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error)

	// UpdateDeploymentAllocHealth is used to set the health of allocations in a
	// deployment
	UpdateDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error)
}
    53  
// Watcher is used to watch deployments and their allocations created
// by the scheduler and trigger the scheduler when allocation health
// transitions.
//
// A Watcher returned by NewDeploymentsWatcher is disabled; the watchers map,
// ctx/exitFn and evalBatcher are only populated once SetEnabled calls flush.
type Watcher struct {
	// enabled records whether the watcher is currently active. It is set by
	// SetEnabled and checked before adding, removing or looking up watchers.
	enabled bool

	// logger receives the watcher's error messages.
	logger *log.Logger

	// queryLimiter is used to limit the rate of blocking queries
	queryLimiter *rate.Limiter

	// evalBatchDuration is the duration to batch eval creation across all
	// deployment watchers
	evalBatchDuration time.Duration

	// raft contains the set of Raft endpoints that can be used by the
	// deployments watcher
	raft DeploymentRaftEndpoints

	// state is the state that is watched for state changes.
	state *state.StateStore

	// watchers is the set of active watchers, one per deployment
	watchers map[string]*deploymentWatcher

	// evalBatcher is used to batch the creation of evaluations
	evalBatcher *EvalBatcher

	// ctx and exitFn are used to cancel the watcher
	ctx    context.Context
	exitFn context.CancelFunc

	// l is held by SetEnabled, add, remove and getOrCreateWatcher while they
	// read or mutate enabled, state, watchers and the objects rebuilt by
	// flush. Only the exclusive Lock/Unlock pair is used in this file.
	l sync.RWMutex
}
    87  
    88  // NewDeploymentsWatcher returns a deployments watcher that is used to watch
    89  // deployments and trigger the scheduler as needed.
    90  func NewDeploymentsWatcher(logger *log.Logger,
    91  	raft DeploymentRaftEndpoints, stateQueriesPerSecond float64,
    92  	evalBatchDuration time.Duration) *Watcher {
    93  
    94  	return &Watcher{
    95  		raft:              raft,
    96  		queryLimiter:      rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100),
    97  		evalBatchDuration: evalBatchDuration,
    98  		logger:            logger,
    99  	}
   100  }
   101  
   102  // SetEnabled is used to control if the watcher is enabled. The watcher
   103  // should only be enabled on the active leader. When being enabled the state is
   104  // passsed in as it is no longer valid once a leader election has taken place.
   105  func (w *Watcher) SetEnabled(enabled bool, state *state.StateStore) error {
   106  	w.l.Lock()
   107  	defer w.l.Unlock()
   108  
   109  	wasEnabled := w.enabled
   110  	w.enabled = enabled
   111  
   112  	if state != nil {
   113  		w.state = state
   114  	}
   115  
   116  	// Flush the state to create the necessary objects
   117  	w.flush()
   118  
   119  	// If we are starting now, launch the watch daemon
   120  	if enabled && !wasEnabled {
   121  		go w.watchDeployments(w.ctx)
   122  	}
   123  
   124  	return nil
   125  }
   126  
   127  // flush is used to clear the state of the watcher
   128  func (w *Watcher) flush() {
   129  	// Stop all the watchers and clear it
   130  	for _, watcher := range w.watchers {
   131  		watcher.StopWatch()
   132  	}
   133  
   134  	// Kill everything associated with the watcher
   135  	if w.exitFn != nil {
   136  		w.exitFn()
   137  	}
   138  
   139  	w.watchers = make(map[string]*deploymentWatcher, 32)
   140  	w.ctx, w.exitFn = context.WithCancel(context.Background())
   141  	w.evalBatcher = NewEvalBatcher(w.evalBatchDuration, w.raft, w.ctx)
   142  }
   143  
   144  // watchDeployments is the long lived go-routine that watches for deployments to
   145  // add and remove watchers on.
   146  func (w *Watcher) watchDeployments(ctx context.Context) {
   147  	dindex := uint64(1)
   148  	for {
   149  		// Block getting all deployments using the last deployment index.
   150  		deployments, idx, err := w.getDeploys(ctx, dindex)
   151  		if err != nil {
   152  			if err == context.Canceled {
   153  				return
   154  			}
   155  
   156  			w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deploylements: %v", err)
   157  		}
   158  
   159  		// Update the latest index
   160  		dindex = idx
   161  
   162  		// Ensure we are tracking the things we should and not tracking what we
   163  		// shouldn't be
   164  		for _, d := range deployments {
   165  			if d.Active() {
   166  				if err := w.add(d); err != nil {
   167  					w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err)
   168  				}
   169  			} else {
   170  				w.remove(d)
   171  			}
   172  		}
   173  	}
   174  }
   175  
   176  // getDeploys retrieves all deployments blocking at the given index.
   177  func (w *Watcher) getDeploys(ctx context.Context, minIndex uint64) ([]*structs.Deployment, uint64, error) {
   178  	resp, index, err := w.state.BlockingQuery(w.getDeploysImpl, minIndex, ctx)
   179  	if err != nil {
   180  		return nil, 0, err
   181  	}
   182  
   183  	return resp.([]*structs.Deployment), index, nil
   184  }
   185  
   186  // getDeploysImpl retrieves all deployments from the passed state store.
   187  func (w *Watcher) getDeploysImpl(ws memdb.WatchSet, state *state.StateStore) (interface{}, uint64, error) {
   188  
   189  	iter, err := state.Deployments(ws)
   190  	if err != nil {
   191  		return nil, 0, err
   192  	}
   193  
   194  	var deploys []*structs.Deployment
   195  	for {
   196  		raw := iter.Next()
   197  		if raw == nil {
   198  			break
   199  		}
   200  		deploy := raw.(*structs.Deployment)
   201  		deploys = append(deploys, deploy)
   202  	}
   203  
   204  	// Use the last index that affected the deployment table
   205  	index, err := state.Index("deployment")
   206  	if err != nil {
   207  		return nil, 0, err
   208  	}
   209  
   210  	return deploys, index, nil
   211  }
   212  
   213  // add adds a deployment to the watch list
   214  func (w *Watcher) add(d *structs.Deployment) error {
   215  	w.l.Lock()
   216  	defer w.l.Unlock()
   217  	_, err := w.addLocked(d)
   218  	return err
   219  }
   220  
   221  // addLocked adds a deployment to the watch list and should only be called when
   222  // locked.
   223  func (w *Watcher) addLocked(d *structs.Deployment) (*deploymentWatcher, error) {
   224  	// Not enabled so no-op
   225  	if !w.enabled {
   226  		return nil, nil
   227  	}
   228  
   229  	if !d.Active() {
   230  		return nil, fmt.Errorf("deployment %q is terminal", d.ID)
   231  	}
   232  
   233  	// Already watched so no-op
   234  	if _, ok := w.watchers[d.ID]; ok {
   235  		return nil, nil
   236  	}
   237  
   238  	// Get the job the deployment is referencing
   239  	snap, err := w.state.Snapshot()
   240  	if err != nil {
   241  		return nil, err
   242  	}
   243  
   244  	job, err := snap.JobByID(nil, d.Namespace, d.JobID)
   245  	if err != nil {
   246  		return nil, err
   247  	}
   248  	if job == nil {
   249  		return nil, fmt.Errorf("deployment %q references unknown job %q", d.ID, d.JobID)
   250  	}
   251  
   252  	watcher := newDeploymentWatcher(w.ctx, w.queryLimiter, w.logger, w.state, d, job, w)
   253  	w.watchers[d.ID] = watcher
   254  	return watcher, nil
   255  }
   256  
   257  // remove stops watching a deployment. This can be because the deployment is
   258  // complete or being deleted.
   259  func (w *Watcher) remove(d *structs.Deployment) {
   260  	w.l.Lock()
   261  	defer w.l.Unlock()
   262  
   263  	// Not enabled so no-op
   264  	if !w.enabled {
   265  		return
   266  	}
   267  
   268  	if watcher, ok := w.watchers[d.ID]; ok {
   269  		watcher.StopWatch()
   270  		delete(w.watchers, d.ID)
   271  	}
   272  }
   273  
   274  // forceAdd is used to force a lookup of the given deployment object and create
   275  // a watcher. If the deployment does not exist or is terminal an error is
   276  // returned.
   277  func (w *Watcher) forceAdd(dID string) (*deploymentWatcher, error) {
   278  	snap, err := w.state.Snapshot()
   279  	if err != nil {
   280  		return nil, err
   281  	}
   282  
   283  	deployment, err := snap.DeploymentByID(nil, dID)
   284  	if err != nil {
   285  		return nil, err
   286  	}
   287  
   288  	if deployment == nil {
   289  		return nil, fmt.Errorf("unknown deployment %q", dID)
   290  	}
   291  
   292  	return w.addLocked(deployment)
   293  }
   294  
   295  // getOrCreateWatcher returns the deployment watcher for the given deployment ID.
   296  func (w *Watcher) getOrCreateWatcher(dID string) (*deploymentWatcher, error) {
   297  	w.l.Lock()
   298  	defer w.l.Unlock()
   299  
   300  	// Not enabled so no-op
   301  	if !w.enabled {
   302  		return nil, notEnabled
   303  	}
   304  
   305  	watcher, ok := w.watchers[dID]
   306  	if ok {
   307  		return watcher, nil
   308  	}
   309  
   310  	return w.forceAdd(dID)
   311  }
   312  
   313  // SetAllocHealth is used to set the health of allocations for a deployment. If
   314  // there are any unhealthy allocations, the deployment is updated to be failed.
   315  // Otherwise the allocations are updated and an evaluation is created.
   316  func (w *Watcher) SetAllocHealth(req *structs.DeploymentAllocHealthRequest, resp *structs.DeploymentUpdateResponse) error {
   317  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   318  	if err != nil {
   319  		return err
   320  	}
   321  
   322  	return watcher.SetAllocHealth(req, resp)
   323  }
   324  
   325  // PromoteDeployment is used to promote a deployment. If promote is false,
   326  // deployment is marked as failed. Otherwise the deployment is updated and an
   327  // evaluation is created.
   328  func (w *Watcher) PromoteDeployment(req *structs.DeploymentPromoteRequest, resp *structs.DeploymentUpdateResponse) error {
   329  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   330  	if err != nil {
   331  		return err
   332  	}
   333  
   334  	return watcher.PromoteDeployment(req, resp)
   335  }
   336  
   337  // PauseDeployment is used to toggle the pause state on a deployment. If the
   338  // deployment is being unpaused, an evaluation is created.
   339  func (w *Watcher) PauseDeployment(req *structs.DeploymentPauseRequest, resp *structs.DeploymentUpdateResponse) error {
   340  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   341  	if err != nil {
   342  		return err
   343  	}
   344  
   345  	return watcher.PauseDeployment(req, resp)
   346  }
   347  
   348  // FailDeployment is used to fail the deployment.
   349  func (w *Watcher) FailDeployment(req *structs.DeploymentFailRequest, resp *structs.DeploymentUpdateResponse) error {
   350  	watcher, err := w.getOrCreateWatcher(req.DeploymentID)
   351  	if err != nil {
   352  		return err
   353  	}
   354  
   355  	return watcher.FailDeployment(req, resp)
   356  }
   357  
   358  // createEvaluation commits the given evaluation to Raft but batches the commit
   359  // with other calls.
   360  func (w *Watcher) createEvaluation(eval *structs.Evaluation) (uint64, error) {
   361  	return w.evalBatcher.CreateEval(eval).Results()
   362  }
   363  
// upsertJob commits the given job to Raft by forwarding it unchanged to the
// UpsertJob Raft endpoint and returning that endpoint's result.
func (w *Watcher) upsertJob(job *structs.Job) (uint64, error) {
	return w.raft.UpsertJob(job)
}
   368  
   369  // upsertDeploymentStatusUpdate commits the given deployment update and optional
   370  // evaluation to Raft
   371  func (w *Watcher) upsertDeploymentStatusUpdate(
   372  	u *structs.DeploymentStatusUpdate,
   373  	e *structs.Evaluation,
   374  	j *structs.Job) (uint64, error) {
   375  	return w.raft.UpdateDeploymentStatus(&structs.DeploymentStatusUpdateRequest{
   376  		DeploymentUpdate: u,
   377  		Eval:             e,
   378  		Job:              j,
   379  	})
   380  }
   381  
// upsertDeploymentPromotion commits the given deployment promotion to Raft by
// forwarding the request unchanged to the UpdateDeploymentPromotion Raft
// endpoint and returning that endpoint's result.
func (w *Watcher) upsertDeploymentPromotion(req *structs.ApplyDeploymentPromoteRequest) (uint64, error) {
	return w.raft.UpdateDeploymentPromotion(req)
}
   386  
// upsertDeploymentAllocHealth commits the given allocation health changes to
// Raft by forwarding the request unchanged to the UpdateDeploymentAllocHealth
// Raft endpoint and returning that endpoint's result.
func (w *Watcher) upsertDeploymentAllocHealth(req *structs.ApplyDeploymentAllocHealthRequest) (uint64, error) {
	return w.raft.UpdateDeploymentAllocHealth(req)
}